
Commit 7bc465c

Merge branch 'main' into algorithm_dev
2 parents: eddf4e4 + c05296d

File tree: 10 files changed, +428 -6 lines

README.md (2 additions, 2 deletions)

@@ -12,7 +12,7 @@

 [![paper](http://img.shields.io/badge/cs.LG-2505.17826-B31B1B?logo=arxiv&logoColor=red)](https://arxiv.org/abs/2505.17826)
 [![doc](https://img.shields.io/badge/Docs-blue?logo=markdown)](https://modelscope.github.io/Trinity-RFT/)
-[![pypi](https://img.shields.io/pypi/v/trinity-rft?logo=pypi&color=026cad)](https://pypi.org/project/trinity-rft/0.1.0/)
+[![pypi](https://img.shields.io/pypi/v/trinity-rft?logo=pypi&color=026cad)](https://pypi.org/project/trinity-rft/0.1.1/)
 ![license](https://img.shields.io/badge/license-Apache--2.0-000000.svg)

 </div>

@@ -159,7 +159,7 @@ pip install -e .\[flash_attn\]
 Installation using pip:

 ```shell
-pip install trinity-rft==0.1.0
+pip install trinity-rft==0.1.1
 ```

 Installation from docker:

docs/sphinx_doc/source/main.md (1 addition, 1 deletion)

@@ -130,7 +130,7 @@ pip install flash-attn -v
 Installation using pip:

 ```shell
-pip install trinity-rft==0.1.0
+pip install trinity-rft==0.1.1
 ```

 Installation from docker:

pyproject.toml (1 addition, 1 deletion)

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "trinity-rft"
-version = "0.1.0"
+version = "0.1.1"
 authors = [
     {name="Trinity-RFT Team", email="[email protected]"},
 ]

tests/explorer/workflow_test.py (52 additions, 1 deletion)

@@ -5,7 +5,7 @@
 from unittest.mock import MagicMock

 from tests.tools import get_unittest_dataset_config
-from trinity.common.workflows import MathWorkflow, Workflow
+from trinity.common.workflows import MathBoxedWorkflow, MathWorkflow, Workflow
 from trinity.common.workflows.workflow import Task


@@ -134,6 +134,57 @@ def test_math_complex_workflow(self) -> None:
         self.assertEqual(len(experiences), 1)
         self.assertEqual(experiences[0].reward, 0.9)

+    def test_math_boxed_workflow(self) -> None:
+        model = MagicMock()
+        model.chat.return_value = [
+            MockResponse("<think> balabalabala 99 </think>\n \\boxed{36}"),
+            MockResponse("answer is \\boxed{36 }"),
+            MockResponse("Kim's total points are 6 + 30 =\\boxed{36}"),
+            MockResponse("<think> balalaba </think> \\boxed{35.00}"),
+        ]
+        taskset_config = get_unittest_dataset_config("countdown")
+        task = Task(
+            workflow=MathBoxedWorkflow,
+            format_args=taskset_config.format,
+            rollout_args=taskset_config.rollout_args,
+            workflow_args={
+                "with_think": False,
+                "format_score_coef": 0.2,
+            },
+            is_eval=False,
+            raw_task={
+                taskset_config.format.prompt_key: "",
+                taskset_config.format.response_key: r"36",
+            },
+        )
+        workflow = task.to_workflow(model=model)
+        experiences = workflow.run()
+        self.assertEqual(experiences[0].reward, 1.0)
+        self.assertEqual(experiences[1].reward, 1.0)
+        self.assertEqual(experiences[2].reward, 1.0)
+        self.assertEqual(experiences[3].reward, 0.0)
+        task_new = Task(
+            workflow=MathBoxedWorkflow,
+            format_args=taskset_config.format,
+            rollout_args=taskset_config.rollout_args,
+            workflow_args={
+                "with_think": True,
+                "format_score_coef": 0.2,
+            },
+            is_eval=False,
+            raw_task={
+                taskset_config.format.prompt_key: "",
+                taskset_config.format.response_key: r"36",
+            },
+        )
+        workflow.reset(task_new)
+        workflow_new = task_new.to_workflow(model=model)
+        experiences = workflow_new.run()
+        self.assertEqual(experiences[0].reward, 1.0)
+        self.assertEqual(experiences[1].reward, 0.8)
+        self.assertEqual(experiences[2].reward, 0.8)
+        self.assertEqual(experiences[3].reward, 0.0)
+
     def test_gsm8k_workflow(self) -> None:
         model = MagicMock()
         model.chat.return_value = [
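The new test pins down the boxed-answer scoring rule: accuracy is 1.0 when the \boxed{} answer matches the truth (0.0 otherwise), and when with_think is enabled a missing <think>...</think> block costs format_score_coef. A minimal self-contained sketch of that arithmetic (boxed_reward is an illustrative helper, not part of the repository):

```python
import math

def boxed_reward(correct: bool, has_think: bool, with_think: bool, coef: float = 0.1) -> float:
    """Accuracy score plus an optional penalty for a missing <think> block."""
    accuracy = 1.0 if correct else 0.0
    penalty = -coef if (with_think and not has_think) else 0.0
    return accuracy + penalty

# The second run above uses with_think=True and format_score_coef=0.2:
assert math.isclose(boxed_reward(True, True, True, 0.2), 1.0)   # <think> present, boxed 36
assert math.isclose(boxed_reward(True, False, True, 0.2), 0.8)  # boxed 36, no <think>
assert math.isclose(boxed_reward(False, True, True, 0.2), 0.0)  # <think> present, wrong answer
```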

trinity/__init__.py (1 addition, 1 deletion)

@@ -1,4 +1,4 @@
 # -*- coding: utf-8 -*-
 """Trinity-RFT (Reinforcement Fine-Tuning)"""

-__version__ = "0.1.0"
+__version__ = "0.1.1"

trinity/buffer/reader/file_reader.py (12 additions, 0 deletions)

@@ -5,6 +5,7 @@
 import datasets
 import transformers
 from datasets import Dataset, load_dataset
+from tqdm import tqdm

 from trinity.algorithm.algorithm import DPOAlgorithm, SFTAlgorithm
 from trinity.buffer.buffer_reader import BufferReader

@@ -35,11 +36,20 @@ def __init__(self, dataset: Dataset, max_epoch: int = 1, offset: int = 0):
         for _ in range(self.current_offset):
             next(self.iter)

+        # Initialize tqdm progress bar
+        self.total_steps = self.dataset_size * self.max_epoch
+        self.progress_bar = tqdm(
+            total=self.total_steps,
+            initial=self.current_epoch * self.dataset_size + self.current_offset,
+            desc="Dataset Progressing",
+        )
+
     def read_batch(self, batch_size: int) -> List:
         batch = []

         while len(batch) < batch_size:
             try:
+                self.progress_bar.update(1)
                 item = next(self.iter)
                 batch.append(item)
                 self.current_offset += 1

@@ -49,7 +59,9 @@ def read_batch(self, batch_size: int) -> List:
                 self.current_offset = 0

                 if self.current_epoch >= self.max_epoch:
+                    self.progress_bar.close()
                     raise StopIteration
+                # Step to the next epoch
                 self.iter = iter(self.dataset)
         return batch
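The progress bar added here sizes itself to the full run (dataset_size * max_epoch items) and starts from current_epoch * dataset_size + current_offset, so a reader restored with a nonzero offset resumes the bar mid-run rather than from zero. A small standalone sketch of that accounting with plain tqdm (the numbers are invented for illustration):

```python
from tqdm import tqdm

dataset_size, max_epoch = 1000, 3        # 3000 items across the whole run
current_epoch, current_offset = 1, 250   # resuming: one full epoch plus 250 items done

bar = tqdm(
    total=dataset_size * max_epoch,
    initial=current_epoch * dataset_size + current_offset,  # bar starts at 1250/3000
    desc="Dataset Progressing",
)
for _ in range(5):
    bar.update(1)  # read_batch ticks the bar once per item it pulls
bar.close()
```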

trinity/common/rewards/reward_fn.py (32 additions, 0 deletions)

@@ -9,10 +9,12 @@
 from math_verify import LatexExtractionConfig, parse, verify

 from trinity.utils.eval_utils import (
+    compute_score,
     evaluate_equation,
     extract_solution,
     simple_answer_parser,
     validate_equation,
+    validate_think_pattern,
 )
 from trinity.utils.log import get_logger
 from trinity.utils.registry import Registry

@@ -195,3 +197,33 @@ def __call__(
             return format_score
         except Exception as e:  # noqa: F841
             return format_score
+
+
+@REWARD_FUNCTIONS.register_module("math_boxed_reward")
+class MathBoxedRewardFn(RewardFn):
+    """A reward function that rewards for math task."""
+
+    def __init__(
+        self,
+    ) -> None:
+        pass
+
+    def __call__(  # type: ignore
+        self,
+        response: str,
+        prompt: Optional[str] = None,
+        truth: Optional[str] = None,
+        return_dict: Optional[bool] = False,
+        with_think: Optional[bool] = False,
+        format_score_coef: Optional[float] = 0.1,
+    ) -> Union[float, dict]:
+        accuracy_score = compute_score(response, truth)
+
+        format_score = 0.0
+        if with_think and not validate_think_pattern(response):
+            format_score = (format_score_coef or 0.1) * -1.0
+
+        if return_dict:
+            return {"accuracy": accuracy_score, "format_score": format_score}
+
+        return accuracy_score + format_score
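The class is registered under the key "math_boxed_reward" alongside the existing reward functions and can also be called directly. A hedged usage sketch, assuming a Trinity-RFT checkout where compute_score awards 1.0 for a \boxed{} answer matching the truth:

```python
from trinity.common.rewards.reward_fn import MathBoxedRewardFn

fn = MathBoxedRewardFn()

# Scalar form: accuracy plus a (possibly negative) format score.
reward = fn(response=r"answer is \boxed{36}", truth="36",
            with_think=True, format_score_coef=0.2)
print(reward)  # expected 0.8: correct answer, no <think>...</think> block

# Dict form, as the workflow requests during evaluation (return_dict=True).
parts = fn(response=r"<think> steps </think> \boxed{36}", truth="36",
           return_dict=True, with_think=True)
print(parts)  # expected {'accuracy': 1.0, 'format_score': 0.0}
```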

trinity/common/workflows/__init__.py (2 additions, 0 deletions)

@@ -1,5 +1,6 @@
 # -*- coding: utf-8 -*-
 """Workflow module"""
+from .customized_math_workflows import MathBoxedWorkflow
 from .envs.alfworld.alfworld_workflow import AlfworldWorkflow
 from .envs.sciworld.sciworld_workflow import SciWorldWorkflow
 from .envs.webshop.webshop_workflow import WebShopWorkflow

@@ -14,4 +15,5 @@
     "WebShopWorkflow",
     "AlfworldWorkflow",
     "SciWorldWorkflow",
+    "MathBoxedWorkflow",
 ]
trinity/common/workflows/customized_math_workflows.py (new file, 92 additions, 0 deletions)

@@ -0,0 +1,92 @@
+# -*- coding: utf-8 -*-
+"""We include the customized math workflows in this file."""
+
+from dataclasses import asdict
+from typing import List
+
+from trinity.common.experience import Experience
+from trinity.common.rewards.reward_fn import MathBoxedRewardFn
+from trinity.common.workflows.workflow import WORKFLOWS, SimpleWorkflow, Task
+from trinity.utils.log import get_logger
+
+logger = get_logger(__name__)
+
+
+@WORKFLOWS.register_module("math_boxed_workflow")
+class MathBoxedWorkflow(SimpleWorkflow):
+    """A workflow for math tasks that give answers in boxed format."""
+
+    def reset(self, task: Task):
+        self.format_args = task.format_args
+        self.system_prompt = task.format_args.system_prompt
+        self.reply_prefix = task.format_args.reply_prefix
+
+        self.raw_task = task.raw_task
+        self.task_desc = task.task_desc
+        self.truth = task.truth
+
+        # Rollout args
+        rollout_args = asdict(task.rollout_args)
+        self.rollout_args = rollout_args
+        self.is_eval = task.is_eval
+
+        self.workflow_args = task.workflow_args
+
+        self.use_base = self.workflow_args.get("use_base", False)
+        self.with_think = self.workflow_args.get("with_think", False)
+        self.format_score_coef = self.workflow_args.get("format_score_coef", 0.1)
+
+        default_prompt = (
+            """Please reason step by step, and put your final answer within \\boxed{}."""
+        )
+
+        default_prompt_with_think = """You are a helpful assistant that solves MATH problems. You should first thinks about the reasoning process in mind and then provides the user with the answer. You should present your reasoning process using the format: <think>\n ...your reasoning process here... </think>\n first. You should always include your final answer in \\boxed{} as closed-form results."""
+
+        if self.system_prompt is None:
+            if self.with_think:
+                self.system_prompt = default_prompt_with_think
+            else:
+                self.system_prompt = default_prompt
+
+        self.reward_fn = MathBoxedRewardFn()
+
+    def format_prompt(self):
+        prompt_text = ""
+        if self.system_prompt:
+            prompt_text += "System:" + self.system_prompt
+            prompt_text += "\nUser:\n" + self.task_desc + "\nAssistant:\n"
+        else:
+            prompt_text += "User:\n" + self.task_desc + "\nAssistant:\n"
+        return prompt_text
+
+    def run(self) -> List[Experience]:
+        # TODO: Optimize the generate function
+        if not self.use_base:
+            messages = self.format_messages()
+        else:
+            prompt_text = self.format_prompt()
+
+        logger.debug("start chat")
+        if not self.use_base:
+            responses = self.model.chat(messages, **self.rollout_args)
+        else:
+            responses = self.model.generate([prompt_text], **self.rollout_args)
+
+        for response in responses:
+            reward = MathBoxedRewardFn()(  # type: ignore [misc]
+                response=response.response_text,  # type: ignore [arg-type]
+                truth=self.truth,
+                return_dict=self.is_eval,
+                with_think=self.with_think,
+                format_score_coef=self.format_score_coef,
+            )
+            logger.debug(
+                f"self.task_desc: {self.task_desc}, messages: {messages}, response: {response.response_text}, reward: {reward}"
+            )
+            if isinstance(reward, dict):
+                if response.metrics is None:
+                    response.metrics = {}
+                response.metrics.update(reward)
+                reward = sum(reward.values())
+            response.reward = reward
+        return responses
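Selecting the workflow from training code mirrors the unit test above. A hedged wiring sketch (taskset_config and model are placeholders for a dataset config and a rollout model handle, not values defined in this diff):

```python
from trinity.common.workflows import MathBoxedWorkflow
from trinity.common.workflows.workflow import Task

task = Task(
    workflow=MathBoxedWorkflow,
    format_args=taskset_config.format,          # placeholder dataset config
    rollout_args=taskset_config.rollout_args,
    workflow_args={
        "use_base": False,          # True routes through model.generate with a raw prompt
        "with_think": True,         # demand a <think>...</think> block
        "format_score_coef": 0.2,   # penalty when the block is missing
    },
    is_eval=False,
    raw_task={
        taskset_config.format.prompt_key: "What is 6 + 30?",
        taskset_config.format.response_key: "36",
    },
)
experiences = task.to_workflow(model=model).run()  # model: placeholder rollout handle
```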
