Commit fa53d98

Add the implementation of evaluation workflow
1 parent: 8d6daea

File tree

13 files changed: +805, -138 lines changed


pyproject.toml

Lines changed: 2 additions & 1 deletion
@@ -66,7 +66,8 @@ dev = [
     "pytest>=8.0.0",
     "pytest-json-ctrf",
     "parameterized",
-    "matplotlib"
+    "matplotlib",
+    "word2number",
 ]
 
 doc = [
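
The new word2number dependency is only declared here; its role is not shown in this commit, but a plausible use (an assumption, not the repository's actual code) is normalizing spelled-out numbers before numeric comparison in the evaluation utilities. A minimal sketch:

# Hedged sketch of why a test suite might want word2number;
# `normalize_number` is a hypothetical helper, not part of this commit.
from word2number import w2n


def normalize_number(text: str):
    """Return a float from either digits ("36") or words ("thirty six")."""
    try:
        return float(text.replace(",", ""))
    except ValueError:
        pass
    try:
        return float(w2n.word_to_num(text))  # e.g. "thirty six" -> 36
    except ValueError:
        return None


assert normalize_number("36") == normalize_number("thirty six") == 36.0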

tests/explorer/workflow_test.py

Lines changed: 31 additions & 0 deletions
@@ -11,6 +11,7 @@
 from trinity.common.rewards import RMGalleryFn
 from trinity.common.workflows import (
     MathBoxedWorkflow,
+    MathEvalWorkflow,
     MathRMWorkflow,
     MathWorkflow,
     Workflow,
@@ -272,6 +273,36 @@ def test_rm_gallery_workflow(self) -> None:
         self.assertEqual(experiences[2].reward, 1.0)
         self.assertEqual(experiences[3].reward, 0.0)
 
+    def test_math_eval_workflow(self) -> None:
+        model = MagicMock()
+        model.chat.return_value = [
+            MockResponse("My step-by-step reasoning leads to the answer \\boxed{36}"),
+            MockResponse("Here is the answer of \\boxed{36.0}"),
+            MockResponse("I made a mistake, the answer is \\boxed{42}"),
+            MockResponse("The answer is 36, but I forgot the box."),
+        ]
+
+        taskset_config = get_unittest_dataset_config("countdown")
+        task = Task(
+            workflow=MathEvalWorkflow,
+            is_eval=True,
+            format_args=taskset_config.format,
+            raw_task={
+                taskset_config.format.prompt_key: "",
+                taskset_config.format.response_key: "36",
+            },
+        )
+
+        workflow = task.to_workflow(model=model)
+        experiences = workflow.run()
+        self.assertEqual(len(experiences), 4)
+        expected_accuracies = [1.0, 1.0, 0.0, 0.0]
+        for i, (exp, expected_acc) in enumerate(zip(experiences, expected_accuracies)):
+            with self.subTest(f"Response {i}"):
+                self.assertEqual(exp.reward, 0.0)
+                assert exp.metrics is not None, f"Metrics for response {i} should not be None"
+                self.assertEqual(exp.metrics["accuracy"], expected_acc)
+
     def test_workflow_resettable(self) -> None:
         model = MagicMock()
         json_task = Task(

tests/template/config.yaml

Lines changed: 1 addition & 0 deletions
@@ -34,6 +34,7 @@ buffer:
     path: 'placeholder'
     split: 'train'
     default_workflow_type: ''
+    default_eval_type: ''
     default_reward_fn_type: ''
 explorer:
   eval_interval: 100

tests/test_data/template.yaml

Lines changed: 1 addition & 0 deletions
@@ -11,6 +11,7 @@ buffer:
   storage_type: file
   path: ''
   default_workflow_type: ''
+  default_eval_type: ''
   default_reward_fn_type: ''
 explorer:
   runner_num: 8

tests/utils/eval_utils_test.py

Lines changed: 97 additions & 0 deletions
@@ -0,0 +1,97 @@
+# -*- coding: utf-8 -*-
+"""Test for the evaluation utils module."""
+
+import unittest
+
+from trinity.utils.eval_utils import is_equiv
+from trinity.utils.math_eval_utils import extract_answer, verify_math_answer
+
+
+class TestMathEvalUtils(unittest.TestCase):
+    def test_extract_answer(self):
+        test_cases = [
+            ("The answer is \\boxed{42}", "42", "Basic boxed extraction"),
+            ("The result is \\boxed{\\frac{1}{2}}", "\\frac{1}{2}", "Boxed with LaTeX"),
+            ("Therefore, the final answer is 100.", "100", "English 'answer is' extraction"),
+            ("My final answer is: 3.14", "3.14", "English 'answer is' with colon"),
+            ("所以,答案是x^2", "x^2", "Chinese 'answer is' extraction"),
+            (
+                "The cost is 10 dollars and the profit is 20 dollars.",
+                "20",
+                "Extract the last number",
+            ),
+            (
+                "There are 1,000 apples and 2,000 oranges.",
+                "2000",
+                "Extract the last number with commas",
+            ),
+            ("The probability is 0.75.", "0.75", "Extract the last decimal"),
+            ("This sentence has no answer.", None, "No answer case"),
+            ("The box is empty \\boxed{}", None, "Empty boxed"),
+            (12345, None, "Input is not a string"),
+        ]
+
+        for i, (input_str, expected_output, description) in enumerate(test_cases):
+            with self.subTest(f"Case {i+1}: {description}"):
+                actual_output = extract_answer(input_str)
+                self.assertEqual(
+                    actual_output,
+                    expected_output,
+                    f"Failed on input: '{input_str}'\nExpected: '{expected_output}', Got: '{actual_output}'",
+                )
+
+    def test_verify_math_answer(self):
+        test_cases = [
+            ("The answer is \\boxed{42}", "42", True, "Simple integer equality"),
+            ("The result is 1,000.", "1000", True, "Number with commas"),
+            ("The answer is -50.", "-50", True, "Negative number equality"),
+            ("The solution is 5", "x=5", True, "Equivalence of value and equation"),
+            ("The answer is \\boxed{42}", "43", False, "Simple numerical inequality"),
+            ("The answer is \\boxed{x+1}", "x-1", False, "Symbolic expression inequality"),
+            (
+                "The matrix is \\boxed{\\begin{pmatrix}1 & 1 \\\\ 0 & 1\\end{pmatrix}}",
+                "\\begin{pmatrix}1&0\\\\0&1\\end{pmatrix}",
+                False,
+                "Matrix inequality",
+            ),
+            ("The speed is 50 km/h", "50", True, "Judgment after stripping units"),
+        ]
+
+        for i, (response, ground_truth, expected_correct, description) in enumerate(test_cases):
+            with self.subTest(f"Case {i+1}: {description}"):
+                accuracy, details = verify_math_answer(response, ground_truth)
+                is_correct = accuracy == 1.0
+                self.assertEqual(
+                    is_correct,
+                    expected_correct,
+                    f"Failed on response: '{response}' with truth: '{ground_truth}'\n"
+                    f"Expected correct: {expected_correct}, Got: {is_correct}\nDetails: {details}",
+                )
+
+
+class TestEvalUtils(unittest.TestCase):
+    def test_is_equiv(self):
+        test_cases = [
+            # str1, str2, expected_output, description
+            (" 123 ", "123", True, "Equivalence with whitespace"),
+            ("50%", "50", True, "Equivalence with percentage sign"),
+            ("$50", "50", True, "Equivalence with dollar sign"),
+            ("hello", "world", False, "Basic inequality"),
+            ("123", "1234", False, "Numerical inequality"),
+            (None, None, True, "Both inputs are None"),
+            ("Some string", None, False, "One input is None (str1)"),
+            (None, "Some string", False, "One input is None (str2)"),
+        ]
+
+        for i, (str1, str2, expected_output, description) in enumerate(test_cases):
+            with self.subTest(f"Case {i+1}: {description}"):
+                actual_output = is_equiv(str1, str2)
+                self.assertEqual(
+                    actual_output,
+                    expected_output,
+                    f"Failed on inputs: ('{str1}', '{str2}')\nExpected: {expected_output}, Got: {actual_output}",
+                )
+
+
+if __name__ == "__main__":
+    unittest.main()
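
Taken together, these cases pin down an extraction order: \boxed{} content first, then an "answer is" / "答案是" clause, then the last number with thousands separators stripped, else None. The sketch below re-implements that order purely for illustration; it is not the code in trinity.utils.math_eval_utils.

# Illustrative re-implementation of the behaviour the test cases describe;
# the real extract_answer lives in trinity/utils/math_eval_utils.py.
import re
from typing import Optional


def sketch_extract_answer(text) -> Optional[str]:
    if not isinstance(text, str):
        return None
    # 1. Content of the last \boxed{...}, with brace matching for nested LaTeX.
    idx = text.rfind("\\boxed{")
    if idx != -1:
        depth, start = 0, idx + len("\\boxed{")
        for i in range(start, len(text)):
            if text[i] == "{":
                depth += 1
            elif text[i] == "}":
                if depth == 0:
                    boxed = text[start:i]
                    if boxed:
                        return boxed  # an empty \boxed{} falls through
                    break
                depth -= 1
    # 2. An explicit "answer is ..." / "答案是..." clause, trailing period stripped.
    m = re.search(r"(?:answer is:?|答案是)\s*(.+)", text, flags=re.IGNORECASE)
    if m:
        return m.group(1).strip().rstrip(".。") or None
    # 3. The last number in the text, dropping thousands separators.
    numbers = re.findall(r"-?\d[\d,]*(?:\.\d+)?", text)
    if numbers:
        return numbers[-1].replace(",", "")
    return None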

trinity/buffer/reader/file_reader.py

Lines changed: 11 additions & 5 deletions
@@ -287,6 +287,9 @@ def __init__(self, meta: StorageConfig, config: BufferConfig):
 
         self.task_type = meta.task_type
         self.default_workflow_cls = WORKFLOWS.get(meta.default_workflow_type)  # type: ignore
+        self.default_eval_workflow_cls = None
+        if getattr(meta, "default_eval_type", None):
+            self.default_eval_workflow_cls = WORKFLOWS.get(meta.default_eval_type)
         self.default_reward_fn_cls = REWARD_FUNCTIONS.get(meta.default_reward_fn_type)  # type: ignore
 
     def read(
@@ -296,11 +299,14 @@ def read(
         tasks = []
         samples = self.dataset.read_batch(batch_size)
         for sample in samples:
-            workflow_class = (
-                WORKFLOWS.get(sample[self.workflow_key])
-                if self.workflow_key in sample
-                else self.default_workflow_cls
-            )
+            if self.task_type == TaskType.EVAL and self.default_eval_workflow_cls:
+                workflow_class = self.default_eval_workflow_cls
+            else:
+                workflow_class = (
+                    WORKFLOWS.get(sample[self.workflow_key])
+                    if self.workflow_key in sample
+                    else self.default_workflow_cls
+                )
             reward_fn = (
                 REWARD_FUNCTIONS.get(sample[self.reward_fn_key])
                 if self.reward_fn_key in sample
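
Restated outside the reader, the precedence this hunk introduces is: a configured eval workflow wins for eval tasksets, otherwise a per-sample workflow key, otherwise the taskset default. A standalone sketch (the helper name and arguments are illustrative, not part of the module's API):

# Illustrative precedence helper mirroring the branch added to read();
# `pick_workflow_cls` is not a function in this commit.
def pick_workflow_cls(task_type, sample, default_eval_cls, default_cls, registry, key="workflow"):
    if task_type == "eval" and default_eval_cls is not None:
        return default_eval_cls               # eval tasksets always use the eval workflow
    if key in sample:
        return registry.get(sample[key])      # per-sample override
    return default_cls                        # taskset-wide default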

trinity/common/config.py

Lines changed: 8 additions & 0 deletions
@@ -97,6 +97,7 @@ class StorageConfig:
 
     # used for rollout tasks
     default_workflow_type: Optional[str] = None
+    default_eval_type: Optional[str] = None
     default_reward_fn_type: Optional[str] = None
     rollout_args: GenerationConfig = field(default_factory=GenerationConfig)
     workflow_args: dict = field(default_factory=dict)
@@ -275,6 +276,7 @@ class ExplorerInput:
     eval_tasksets: List[StorageConfig] = field(default_factory=list)
     # The following args provide default values for the corresponding args in `taskset` and `eval_tasksets`
     default_workflow_type: Optional[str] = None
+    default_eval_type: Optional[str] = None
    default_reward_fn_type: Optional[str] = None
     system_prompt: Optional[str] = None
     reply_prefix: Optional[str] = None
@@ -485,6 +487,10 @@ def _check_buffer(self) -> None:  # noqa: C901
             self.buffer.explorer_input.taskset.default_workflow_type = (
                 self.buffer.explorer_input.default_workflow_type
             )
+        if self.buffer.explorer_input.taskset.default_eval_type is None:
+            self.buffer.explorer_input.taskset.default_eval_type = (
+                self.buffer.explorer_input.default_eval_type
+            )
         if self.buffer.explorer_input.taskset.default_reward_fn_type is None:
             self.buffer.explorer_input.taskset.default_reward_fn_type = (
                 self.buffer.explorer_input.default_reward_fn_type
@@ -510,6 +516,8 @@ def _check_buffer(self) -> None:  # noqa: C901
             dataset.name = f"eval_taskset_{idx}"
             if dataset.default_workflow_type is None:
                 dataset.default_workflow_type = self.buffer.explorer_input.default_workflow_type
+            if dataset.default_eval_type is None:
+                dataset.default_eval_type = self.buffer.explorer_input.default_eval_type
             if dataset.default_reward_fn_type is None:
                 dataset.default_reward_fn_type = self.buffer.explorer_input.default_reward_fn_type
             if dataset.format.system_prompt is None:

trinity/common/workflows/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -5,6 +5,7 @@
 from .envs.alfworld.alfworld_workflow import AlfworldWorkflow
 from .envs.sciworld.sciworld_workflow import SciWorldWorkflow
 from .envs.webshop.webshop_workflow import WebShopWorkflow
+from .eval_workflow import MathEvalWorkflow
 from .math_rm_workflow import MathRMWorkflow
 from .workflow import WORKFLOWS, MathWorkflow, SimpleWorkflow, Task, Workflow
 
@@ -20,4 +21,5 @@
     "MathBoxedWorkflow",
     "MathRMWorkflow",
     "ToolCallWorkflow",
+    "MathEvalWorkflow",
 ]

trinity/common/workflows/eval_workflow.py

Lines changed: 83 additions & 0 deletions

@@ -0,0 +1,83 @@
+# -*- coding: utf-8 -*-
+"""Evaluation Workflow Class"""
+
+from dataclasses import asdict
+from typing import List, Optional
+
+import openai
+
+from trinity.common.config import GenerationConfig
+from trinity.common.experience import Experience
+from trinity.common.models.model import ModelWrapper
+from trinity.common.workflows.workflow import WORKFLOWS, Task, Workflow
+from trinity.utils.log import get_logger
+from trinity.utils.math_eval_utils import verify_math_answer
+
+logger = get_logger(__name__)
+
+
+@WORKFLOWS.register_module("math_eval_workflow")
+class MathEvalWorkflow(Workflow):
+    """
+    A workflow for standard math evaluation.
+
+    The evaluation standard and prompting style follow the Qwen2.5-Math
+    model's evaluation methodology. For more details on their approach, see:
+    https://github.com/QwenLM/Qwen2.5-Math
+    """
+
+    def __init__(
+        self,
+        *,
+        task: Task,
+        model: ModelWrapper,
+        auxiliary_models: Optional[List[openai.OpenAI]] = None,
+    ):
+        super().__init__(
+            task=task,
+            model=model,
+            auxiliary_models=auxiliary_models,
+        )
+
+        self.raw_task = task.raw_task
+        self.truth = task.truth
+
+        # TODO: customize the config in the yaml
+        self.eval_gen_args = asdict(GenerationConfig(temperature=0.6, top_p=0.8, logprobs=0, n=1))
+
+    @property
+    def resettable(self):
+        return False
+
+    def format_messages(self):
+        """Format messages for evaluation in the qwen_boxed style."""
+        if not self.raw_task or "question" not in self.raw_task:
+            raise ValueError("Raw task data must contain a 'question' field for MathEvalWorkflow.")
+
+        problem_input = self.raw_task["question"]
+
+        system_prompt = "You are a helpful assistant."
+        user_prompt = f"{problem_input}\nPlease reason step by step, and put your final answer within \\boxed{{}}."
+
+        messages = [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": user_prompt},
+        ]
+        return messages
+
+    def run(self) -> List[Experience]:
+        messages = self.format_messages()
+
+        responses: List[Experience] = self.model.chat(messages, **self.eval_gen_args)
+
+        for response in responses:
+            accuracy, eval_details = verify_math_answer(
+                response_text=response.response_text, ground_truth=self.truth
+            )
+
+            acc_metrics = {"accuracy": accuracy}
+            if response.metrics is None:
+                response.metrics = {}
+            response.metrics.update(acc_metrics)
+
+        return responses
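
As exercised by the new unit test, the class is driven through Task rather than constructed directly; a condensed sketch of that call path follows. Here `taskset_config` and `model` stand in for a real dataset config and ModelWrapper, and the dataset's prompt_key is assumed to be "question" since format_messages() reads raw_task["question"].

# Condensed from tests/explorer/workflow_test.py; not additional commit code.
from trinity.common.workflows import MathEvalWorkflow, Task

task = Task(
    workflow=MathEvalWorkflow,
    is_eval=True,
    format_args=taskset_config.format,  # supplies prompt_key / response_key
    raw_task={
        taskset_config.format.prompt_key: "What is 6 * 6?",
        taskset_config.format.response_key: "36",
    },
)
experiences = task.to_workflow(model=model).run()
for exp in experiences:
    # reward is left at 0.0 for eval runs; correctness is reported via metrics
    print(exp.metrics["accuracy"])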

trinity/explorer/explorer.py

Lines changed: 6 additions & 0 deletions
@@ -271,6 +271,12 @@ def eval(self):
             self.logger.warning("No evaluation data samples. Skip evaluation.")
             return
         self.logger.info(f"Evaluation at step {self.explore_step_num} started.")
+
+        if self.config.buffer.explorer_input.default_eval_type:
+            self.logger.info(
+                f"Using evaluation workflow: '{self.config.buffer.explorer_input.default_eval_type}'."
+            )
+
         for eval_taskset_config in self.config.buffer.explorer_input.eval_tasksets:
             self.logger.info(
                 f"Evaluation on {eval_taskset_config.name} at step {self.explore_step_num} started."
