[Tunix] Add special handling for math answer grading.

wang2yn84 · The tunix Authors · commit 3b31371ad64a · 2026-03-25T22:55:32.000-07:00
PiperOrigin-RevId: 889627979
diff --git a/examples/deepscaler/math_eval_nb.py b/examples/deepscaler/math_eval_nb.py
@@ -152,9 +152,11 @@ def evaluate_correctness(response: Any, ground_truths: Any) -> bool:
     return False
   # Check against all possible correct answers
   for ground_truth in processed_ground_truths:
-    is_correct = math_utils.grade_answer_mathd(
-        model_answer, ground_truth
-    ) or math_utils.grade_answer_sympy(model_answer, ground_truth)
+    is_correct = (
+        math_utils.grade_answer_mathd(model_answer, ground_truth)
+        or math_utils.grade_answer_sympy(model_answer, ground_truth)
+        or math_utils.grade_answer_special_handling(model_answer, ground_truth)
+    )
     if is_correct:
       print(f" {model_answer=} {ground_truth=} IS CORRECT")
       return True
diff --git a/tests/utils/math_utils_test.py b/tests/utils/math_utils_test.py
@@ -0,0 +1,72 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for tunix.utils.math_utils special handling."""
+
+from absl.testing import absltest
+from absl.testing import parameterized
+from tunix.utils import math_utils
+
+
+class MathUtilsSpecialHandlingTest(parameterized.TestCase):
+  """Table-driven checks for `math_utils.grade_answer_special_handling`.
+
+  Each case supplies a model answer, a ground truth and the expected verdict.
+  The table covers single-digit recurring decimals, a ground truth containing
+  a malformed empty `\\sqrt{}`, and inequality-vs-bracket interval unions
+  (including a ground truth with a missing opening bracket).
+  """
+
+  @parameterized.named_parameters(
+      # 16.67 is the two-decimal rounding of 16.666...
+      dict(
+          testcase_name="recurring_decimal_overlap",
+          given_answer="16.67",
+          ground_truth=r"16.\overline{6}",
+          expected=True,
+      ),
+      # 2.33 is the two-decimal rounding of 2.333...
+      dict(
+          testcase_name="recurring_decimal_all_single_digit_pattern",
+          given_answer="2.33",
+          ground_truth=r"2.\overline{3}",
+          expected=True,
+      ),
+      # 2.3 is the one-decimal rounding of 2.333...
+      dict(
+          testcase_name="recurring_decimal_all_single_digit_pattern2",
+          given_answer="2.3",
+          ground_truth=r"2.\overline{3}",
+          expected=True,
+      ),
+      # The ground truth carries a stray empty brace pair after `\sqrt`.
+      dict(
+          testcase_name="invalid_sqrt_cleanup_equivalent",
+          given_answer=r"\frac{3\sqrt{3}}{2}",
+          ground_truth=r"\frac{3\sqrt{}{3}}{2}",
+          expected=True,
+      ),
+      # Chained inequalities joined by "or" versus bracket notation with \cup.
+      dict(
+          testcase_name="interval_union_equivalence",
+          given_answer=r"$-5\lex\le1$or$3\lex\le9$",
+          ground_truth=r"[-5,1]\cup[3,9]",
+          expected=True,
+      ),
+      # The ground truth is missing its opening bracket and must not match.
+      dict(
+          testcase_name="partial_interval_not_tolerated",
+          given_answer=r"$-5\lex\le1$or$3\lex\le9$",
+          ground_truth=r"-5,1]\cup[3,9]",
+          expected=False,
+      ),
+  )
+  def test_grade_answer_special_handling(
+      self, given_answer: str, ground_truth: str, expected: bool
+  ):
+    """Asserts the grader returns `expected` for the given answer pair."""
+    self.assertEqual(
+        math_utils.grade_answer_special_handling(given_answer, ground_truth),
+        expected,
+    )
+
+
+if __name__ == "__main__":
+  absltest.main()
diff --git a/tunix/utils/math_rewards.py b/tunix/utils/math_rewards.py
@@ -87,9 +87,13 @@ def math_reward(prompts: List[str], completions: List[str], answer: List[str], *
     for ground_truth in processed_ground_truths:
       if found_correct_answer:
         break
-      is_correct = math_utils.grade_answer_mathd(
-          model_answer, ground_truth
-      ) or math_utils.grade_answer_sympy(model_answer, ground_truth)
+      is_correct = (
+          math_utils.grade_answer_mathd(model_answer, ground_truth)
+          or math_utils.grade_answer_sympy(model_answer, ground_truth)
+          or math_utils.grade_answer_special_handling(
+              model_answer, ground_truth
+          )
+      )
       if is_correct:
         found_correct_answer = True
         reward_value: float = 1.0  # Base reward for a correct answer.
diff --git a/tunix/utils/math_utils.py b/tunix/utils/math_utils.py
@@ -14,6 +14,7 @@
 
 """Math utils for evaluating on Math Dataset like Math500 and AIME2024."""
 
+from decimal import Decimal, ROUND_HALF_UP
 import re
 from absl import logging
 from pylatexenc import latex2text
@@ -438,6 +439,193 @@ def extract_boxed_answer(solution: str):
   return solution
 
 
+def _cleanup_invalid_empty_sqrt(expr: str) -> str:
+  """Removes the empty brace pair after `sqrt`: `\\sqrt{}{3}` -> `\\sqrt{3}`.
+
+  Args:
+    expr: Latex text that may contain the malformed `sqrt{}` sequence.
+
+  Returns:
+    `expr` with every `sqrt{}` occurrence shortened to `sqrt`.
+  """
+  empty_sqrt_braces = re.compile(r"sqrt\{\}")
+  return empty_sqrt_braces.sub(r"sqrt", expr)
+
+
+def _parse_special_decimal_interval(expr: str):
+  """Parses a decimal answer into a closed `(low, high)` float interval.
+
+  After `$` and spaces are stripped, two forms are recognised:
+
+  * A single-digit recurring decimal such as `16.\\overline{6}` or
+    `-1.2\\overline{3}`. It maps to the interval spanned by its value rounded
+    half-up to one and to two decimal places, so answers like `16.7` and
+    `16.67` can both match `16.\\overline{6}`.
+  * Any other text that `float()` parses to a finite number, which maps to
+    the degenerate interval `(value, value)`.
+
+  Args:
+    expr: Answer text to parse.
+
+  Returns:
+    A `(low, high)` tuple of floats with `low <= high`, or `None` when `expr`
+    matches neither form.
+  """
+  expr = expr.replace("$", "").replace(" ", "")
+  m = re.fullmatch(r"([+-]?\d+)\.([0-9]*)\\overline\{([0-9])\}", expr)
+  if m is not None:
+    int_part = m.group(1)
+    non_repeating_decimals = m.group(2)
+    recurring_digit = m.group(3)
+
+    # The fractional digits extend the magnitude, so build the magnitude from
+    # the unsigned integer part and apply the sign last. Adding the fraction to
+    # a signed integer part would turn `-2.\overline{3}` into -1.67 and would
+    # drop the sign of `-0.\overline{3}` altogether.
+    is_negative = int_part.startswith("-")
+    scale = Decimal(10) ** len(non_repeating_decimals)
+    magnitude = (
+        Decimal(int_part.lstrip("+-"))
+        + Decimal(non_repeating_decimals or "0") / scale
+        + Decimal(recurring_digit) / (Decimal(9) * scale)
+    )
+    value = -magnitude if is_negative else magnitude
+
+    # Only single-digit recurring blocks are supported. The interval is bounded
+    # by the 1-decimal and 2-decimal rounded values.
+    rounded_1 = float(value.quantize(Decimal("0.1"), rounding=ROUND_HALF_UP))
+    rounded_2 = float(value.quantize(Decimal("0.01"), rounding=ROUND_HALF_UP))
+    return (min(rounded_1, rounded_2), max(rounded_1, rounded_2))
+
+  try:
+    value = float(expr)
+  except ValueError:
+    return None
+  # `float()` also accepts "nan" and "inf". Neither is a usable endpoint, and
+  # every ordering comparison against NaN is false, which would defeat any
+  # overlap test performed on the result.
+  if not float("-inf") < value < float("inf"):
+    return None
+  return (value, value)
+
+
+def _intervals_overlap(
+    interval_a: tuple[float, float], interval_b: tuple[float, float]
+) -> bool:
+  """Returns whether two closed `(low, high)` intervals share a point.
+
+  Args:
+    interval_a: First interval as `(low, high)`.
+    interval_b: Second interval as `(low, high)`.
+
+  Returns:
+    True if the intervals intersect (touching endpoints count), False
+    otherwise, including whenever any endpoint is NaN.
+  """
+  # Stated positively on purpose: every ordering comparison with NaN is false,
+  # so the "neither interval lies strictly before the other" formulation would
+  # report an overlap for a NaN endpoint.
+  return interval_a[0] <= interval_b[1] and interval_b[0] <= interval_a[1]
+
+
+def _parse_interval_set(expr: str):
+  """Turns an interval or union of intervals into a sorted list of tuples.
+
+  Two notations are understood: brackets (`[-5,1]\\cup[3,9]`) and chained
+  `\\le` inequalities around an optional single-letter variable
+  (`-5\\le x\\le1 or 3\\le x\\le9`). Every member of a union must use the
+  same notation.
+
+  Args:
+    expr: Answer text describing one interval or a union of intervals.
+
+  Returns:
+    A sorted list of `(low, high, low_closed, high_closed)` tuples, or `None`
+    if `expr` is empty or any member fails to parse.
+  """
+  # Canonicalise spelling: one token for "less or equal", one union separator,
+  # no dollar signs, no spaces. The substitutions are applied in this order.
+  normalized = expr.lower().strip()
+  for old, new in (
+      ("$", ""),
+      ("≤", "\\le"),
+      ("\\leq", "\\le"),
+      ("<=", "\\le"),
+      ("\\cup", "|"),
+      ("∪", "|"),
+      ("or", "|"),
+      (" ", ""),
+  ):
+    normalized = normalized.replace(old, new)
+
+  # An empty string yields no pieces, so it is rejected here as well.
+  pieces = [piece for piece in normalized.split("|") if piece]
+  if not pieces:
+    return None
+
+  # Bracket notation, e.g. [a,b] or (a,b]: used only if every piece matches.
+  bracket_matches = [
+      re.fullmatch(
+          r"([\[(])([+-]?(?:\d+(?:\.\d+)?|\.\d+)),([+-]?(?:\d+(?:\.\d+)?|\.\d+))([\])])",
+          piece,
+      )
+      for piece in pieces
+  ]
+  if all(found is not None for found in bracket_matches):
+    parsed = []
+    for found in bracket_matches:
+      low, high = float(found.group(2)), float(found.group(3))
+      low_closed = found.group(1) == "["
+      high_closed = found.group(4) == "]"
+      if low > high:
+        # Reversed endpoints: swap them and carry each bracket along.
+        low, high = high, low
+        low_closed, high_closed = high_closed, low_closed
+      parsed.append((low, high, low_closed, high_closed))
+    return sorted(parsed)
+
+  # Inequality notation, e.g. -5\lex\le1: both ends are always closed.
+  parsed = []
+  for piece in pieces:
+    found = re.fullmatch(
+        r"([+-]?(?:\d+(?:\.\d+)?|\.\d+))\\le[a-z]?\\le([+-]?(?:\d+(?:\.\d+)?|\.\d+))",
+        piece,
+    )
+    if found is None:
+      return None
+    low, high = float(found.group(1)), float(found.group(2))
+    if low > high:
+      low, high = high, low
+    parsed.append((low, high, True, True))
+
+  return sorted(parsed)
+
+
+def _match_recurring_decimal_special_case(
+    given_clean: str, ground_truth_clean: str
+) -> bool:
+  """Matches answers against single-digit `\\overline` recurring decimals.
+
+  Args:
+    given_clean: The answer being graded.
+    ground_truth_clean: The reference answer.
+
+  Returns:
+    True when at least one side contains a `<digits>.\\overline{<digit>}` form,
+    both sides parse to numeric intervals, and those intervals overlap.
+  """
+  overline_form = r"[0-9]+\.\s*\\overline\{[0-9]\}"
+  candidates = (given_clean, ground_truth_clean)
+  # This rule only applies when a recurring decimal is actually involved.
+  if not any(re.search(overline_form, text) for text in candidates):
+    return False
+
+  given_interval = _parse_special_decimal_interval(given_clean)
+  if given_interval is None:
+    return False
+  ground_truth_interval = _parse_special_decimal_interval(ground_truth_clean)
+  if ground_truth_interval is None:
+    return False
+  return _intervals_overlap(given_interval, ground_truth_interval)
+
+
+def _match_interval_union_special_case(
+    given_clean: str, ground_truth_clean: str
+) -> bool:
+  """Treats inequality unions and bracket unions as the same set.
+
+  Args:
+    given_clean: The answer being graded.
+    ground_truth_clean: The reference answer.
+
+  Returns:
+    True when both strings parse as interval sets and the parsed sets are
+    equal; False if either fails to parse or they differ.
+  """
+  answer_intervals = _parse_interval_set(given_clean)
+  if answer_intervals is None:
+    return False
+  truth_intervals = _parse_interval_set(ground_truth_clean)
+  return truth_intervals is not None and answer_intervals == truth_intervals
+
+
+def _match_invalid_sqrt_special_case(
+    given_answer: str,
+    ground_truth: str,
+    given_clean: str,
+    ground_truth_clean: str,
+) -> bool:
+  """Re-grades a pair after malformed `sqrt{}` sequences were cleaned up.
+
+  Args:
+    given_answer: The answer as originally supplied.
+    ground_truth: The reference answer as originally supplied.
+    given_clean: The answer after cleanup.
+    ground_truth_clean: The reference answer after cleanup.
+
+  Returns:
+    False when cleanup changed neither string. Otherwise True if the cleaned
+    strings normalize to the same text, or if the normalized answer is
+    non-empty and `are_equal_under_sympy` accepts the pair.
+  """
+  cleanup_was_noop = (
+      given_clean == given_answer and ground_truth_clean == ground_truth
+  )
+  if cleanup_was_noop:
+    # Nothing was repaired, so this rule has nothing to add.
+    return False
+
+  given_normalized = _normalize(given_clean)
+  ground_truth_normalized = _normalize(ground_truth_clean)
+  if given_normalized is None or ground_truth_normalized is None:
+    return False
+  if given_normalized == ground_truth_normalized:
+    return True
+  # Fall back to a symbolic comparison, skipped for an empty answer.
+  return len(given_normalized) > 0 and are_equal_under_sympy(
+      ground_truth_normalized, given_normalized
+  )
+
+
+def grade_answer_special_handling(given_answer: str, ground_truth: str) -> bool:
+  """Grades an answer using a few format-specific equivalence rules.
+
+  Malformed `sqrt{}` sequences are repaired in the ground truth only; the
+  given answer is used as supplied. The rules are tried in order and the first
+  one that matches wins:
+
+  1. The given answer equals the repaired ground truth verbatim.
+  2. Recurring-decimal overlap (single-digit `\\overline` forms).
+  3. Equivalence after the malformed `sqrt{}` repair.
+  4. Inequality union versus bracket-interval union equivalence.
+
+  Args:
+    given_answer: The answer being graded.
+    ground_truth: The reference answer.
+
+  Returns:
+    True if any rule accepts the pair; False otherwise, including when either
+    argument is None.
+  """
+  if given_answer is None or ground_truth is None:
+    return False
+
+  repaired_truth = _cleanup_invalid_empty_sqrt(ground_truth)
+  return bool(
+      given_answer == repaired_truth
+      or _match_recurring_decimal_special_case(given_answer, repaired_truth)
+      or _match_invalid_sqrt_special_case(
+          given_answer, ground_truth, given_answer, repaired_truth
+      )
+      or _match_interval_union_special_case(given_answer, repaired_truth)
+  )
+
+
 def grade_answer_sympy(given_answer: str, ground_truth: str) -> bool:
   """Grades a given answer against a ground truth using sympy for evaluation."""
   ground_truth_normalized = _normalize(ground_truth)