Rebase and fix issues in the eval workflow, math eval utils, and eval utils test

lingzhq · lingzhq · commit e12f4b7101f4 · 2025-07-29T21:34:32.000+08:00
diff --git a/tests/utils/eval_utils_test.py b/tests/utils/eval_utils_test.py
@@ -37,7 +37,7 @@ def test_extract_answer(self):
                 self.assertEqual(
                     actual_output,
                     expected_output,
-                    "Failed on input: '{input_str}'\nExpected: '{expected_output}', Got: '{actual_output}'",
+                    f"Failed on input: '{input_str}'\nExpected: '{expected_output}', Got: '{actual_output}'",
                 )
 
     def test_verify_math_answer(self):
diff --git a/trinity/common/workflows/eval_workflow.py b/trinity/common/workflows/eval_workflow.py
@@ -71,7 +71,10 @@ def run(self) -> List[Experience]:
         responses: List[Experience] = self.model.chat(messages, **self.eval_gen_args)
 
         for response in responses:
-            accuracy, eval_details = verify_math_answer(
+            if response.response_text is None or self.task.truth is None:
+                continue
+
+            accuracy, _ = verify_math_answer(
                 response_text=response.response_text, ground_truth=self.task.truth
             )
 
diff --git a/trinity/utils/math_eval_utils.py b/trinity/utils/math_eval_utils.py
@@ -25,7 +25,7 @@
 from word2number import w2n
 
 
-def verify_math_answer(response_text, ground_truth) -> Tuple[float, Dict[str, Any]]:
+def verify_math_answer(response_text: str, ground_truth: str) -> Tuple[float, Dict[str, Any]]:
     """Strictly compare the equality of response and groundtruth."""
     # Parse the response
     parsed_prediction = extract_answer(response_text)
@@ -234,7 +234,7 @@ def extract_answer(response_text: str) -> Optional[str]:
     "inch",
 ]
 
-unit_texts.extend([t + "s" for t in unit_texts])
+unit_texts.extend([t + "s" for t in unit_texts if not t.endswith("s")])
 
 
 def strip_string(input_str: Optional[str]) -> Optional[str]:
@@ -319,7 +319,7 @@ def fix_fracs(string):
             else:
                 try:
                     assert len(substr) >= 2
-                except Exception:
+                except AssertionError:
                     return string
                 a = substr[0]
                 b = substr[1]

Original file line number	Diff line number	Diff line change
`@@ -37,7 +37,7 @@ def test_extract_answer(self):`
`37`	`37`	`self.assertEqual(`
`38`	`38`	`actual_output,`
`39`	`39`	`expected_output,`
`40`		`- "Failed on input: '{input_str}'\nExpected: '{expected_output}', Got: '{actual_output}'",`
	`40`	`+ f"Failed on input: '{input_str}'\nExpected: '{expected_output}', Got: '{actual_output}'",`
`41`	`41`	`)`
`42`	`42`
`43`	`43`	`def test_verify_math_answer(self):`