
Commit d47c563

fix rollout, action mask, attention mask bugs
1 parent b6391bd commit d47c563

File tree

10 files changed, +80 -45 lines changed


applications/ColossalChat/coati/distributed/agent/agentic_producer.py

Lines changed: 1 addition & 8 deletions
@@ -6,7 +6,6 @@
 
 import ray
 from coati.distributed.agent.base import BaseAgenticProducer
-from transformers import AutoTokenizer
 
 DEFAULT_SYSTEM_MESSAGE = """A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The Assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <reason> </reason> and <answer> </answer> tags, respectively, i.e., <reason> reasoning process here </reason><answer> answer here </answer>."""
 
@@ -88,13 +87,6 @@ def __init__(
         self.tool_workers = tool_workers
         self.agentic_config = model_config if not agentic_config else agentic_config
         self.agentic_config.update({"model": model_config["path"]})
-        tokenizer_path = None
-        if tokenizer_config and "path" in tokenizer_config:
-            tokenizer_path = tokenizer_config["path"]
-        elif "path" in model_config:
-            tokenizer_path = model_config["path"]
-        assert tokenizer_path is not None, "Tokenizer path must be provided either in tokenizer_config or model_config."
-        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, trust_remote_code=True)
         self.tools_schema = []
         self.tool_call_budget = self.agentic_config.get("tool_call_budget", 3)
         self.llm_call_budget = self.agentic_config.get("llm_call_budget", 10)
@@ -258,6 +250,7 @@ def _run_agentic_pipeline(self, messages):
                 )
             )
             llm_call_count += 1
+            self.consumer_global_step = response.pop("consumer_global_step")
             response_input_ids = response["input_ids"]
             logprobs = response["action_log_probs"]
             response_text = self.tokenizer.decode(

applications/ColossalChat/coati/distributed/agent/base.py

Lines changed: 7 additions & 8 deletions
@@ -135,15 +135,13 @@ def rollout(self, **kwargs) -> Dict[str, torch.Tensor]:
             input_ids = torch.nn.functional.pad(
                 response_input_ids, (to_pad_left, to_pad_right), "constant", value=self.tokenizer.pad_token_id
             )  # [1, max_length]
-            attention_mask = torch.nn.functional.pad(
-                torch.ones_like(response_input_ids), (to_pad_left, to_pad_right), "constant", value=0
-            )  # [1, max_length]
-            action_mask = torch.nn.functional.pad(
-                torch.ones(size=(1, response_length)), (0, to_pad_right), "constant", value=0
-            )  # [1, max_length-prompt_length]
+            attention_mask = input_ids.ne(self.tokenizer.pad_token_id).int()  # [1, max_length]
+            action_mask = input_ids[:, max_prompt_length:].ne(self.tokenizer.pad_token_id).int()
             rollouts["attention_mask"].append(attention_mask)
             rollouts["action_mask"].append(action_mask)
-            truncated_logprobs = logprobs[:, :, prompt_length : prompt_length + self.generate_config["max_tokens"]]
+            truncated_logprobs = logprobs[
+                :, :, prompt_length : prompt_length + self.generate_config["max_tokens"]
+            ]  # truncate to max_new_tokens
             logprobs_padded = torch.nn.functional.pad(
                 truncated_logprobs,
                 (0, self.generate_config["max_tokens"] - truncated_logprobs.size(-1)),
@@ -177,7 +175,8 @@ def rollout(self, **kwargs) -> Dict[str, torch.Tensor]:
                         "rollout": self.tokenizer.batch_decode(
                             rollouts["input_ids"][:, 0], skip_special_tokens=True
                         ),
-                    }
+                    },
+                    ensure_ascii=False,
                 )
                 + "\n"
             )
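The mask change above derives both masks directly from the padded sequence instead of padding separate ones-tensors, so the masks can never drift out of sync with input_ids. A minimal standalone sketch of the idea (pad_token_id, max_prompt_length and the example lengths are illustrative; it assumes the pad token never occurs as a real token):

import torch

pad_token_id = 0
max_prompt_length = 4  # prompts are left-padded to this length
max_length = 10        # full sequences are right-padded to this length

# prompt (3 tokens) followed by generated tokens (4 tokens)
response_input_ids = torch.tensor([[11, 12, 13, 21, 22, 23, 24]])
prompt_length = 3

to_pad_left = max_prompt_length - prompt_length
to_pad_right = max_length - response_input_ids.size(1) - to_pad_left

input_ids = torch.nn.functional.pad(
    response_input_ids, (to_pad_left, to_pad_right), "constant", value=pad_token_id
)  # [1, max_length]

# Old approach: pad ones-tensors with the same offsets; any mismatch between those
# offsets and the ones used for input_ids silently desynchronizes the masks.
# New approach: read the masks off the padded ids themselves.
attention_mask = input_ids.ne(pad_token_id).int()                      # [1, max_length]
action_mask = input_ids[:, max_prompt_length:].ne(pad_token_id).int()  # [1, max_length - max_prompt_length]

print(input_ids)       # tensor([[ 0, 11, 12, 13, 21, 22, 23, 24,  0,  0]])
print(attention_mask)  # tensor([[0, 1, 1, 1, 1, 1, 1, 1, 0, 0]])
print(action_mask)     # tensor([[1, 1, 1, 1, 0, 0]])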

applications/ColossalChat/coati/distributed/agent/math_tools.py

Lines changed: 1 addition & 1 deletion
@@ -20,7 +20,7 @@ def run_python_code(code: str) -> str:
         code = code.replace("```python", "```", 1).strip()
     if code.startswith("```py"):  # qwen3 uses ```py
         code = code.replace("```py", "```", 1).strip()
-    return python_repl.run(code, timeout=20)
+    return python_repl.run(code, timeout=30)
 
 
 repl_tool = Tool(

applications/ColossalChat/coati/distributed/consumer.py

Lines changed: 0 additions & 1 deletion
@@ -325,7 +325,6 @@ def loop(self) -> None:
                     )  # for setting start index when resuming training
                     if self.rank == 0:
                         print(f"Saved model checkpoint at step {step + 1} in folder {self.save_dir}")
-                # breakpoint()
                 if (episode != self.num_episodes - 1 or step != self.num_update_per_episode - 1) and (
                     episode != 0 or step >= self.n_behind
                 ):

applications/ColossalChat/coati/distributed/inference_backend.py

Lines changed: 1 addition & 1 deletion
@@ -409,7 +409,7 @@ async def generate(
                     log_probs[generation_id].extend(p)
         self.profiler.exit(f"vllm generate {request_id}")
         # pad them
-        max_len = self.sample_params.max_tokens
+        max_len = sample_params.max_tokens
         action_mask = torch.ones(len(out_tokens), max_len, dtype=attention_mask.dtype)
 
         for i, new_token_ids in enumerate(out_tokens):
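The one-line fix above pads generations using the sampling parameters of the current request (sample_params) rather than the instance-wide self.sample_params, which matters when a call (for example evaluation) passes a different max_tokens. A standalone sketch of the padding and action-mask construction that follows it (token values, max_len and pad_token_id are illustrative):

import torch

pad_token_id = 0
max_len = 8  # stands in for sample_params.max_tokens of the current request
out_tokens = [[5, 6, 7], [5, 6, 7, 8, 9]]  # variable-length generations

# every generation is padded to max_len; positions past the real tokens are masked out
action_mask = torch.ones(len(out_tokens), max_len, dtype=torch.int64)
padded_tokens = torch.full((len(out_tokens), max_len), pad_token_id, dtype=torch.int64)
for i, new_token_ids in enumerate(out_tokens):
    padded_tokens[i, : len(new_token_ids)] = torch.tensor(new_token_ids)
    action_mask[i, len(new_token_ids) :] = 0

print(padded_tokens)
print(action_mask)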

applications/ColossalChat/coati/distributed/launch.py

Lines changed: 1 addition & 1 deletion
@@ -68,7 +68,7 @@ def launch_distributed(
     eval_interval: int = 100,
     eval_save_dir: Optional[str] = None,
     eval_generation_config: Optional[Dict[str, Any]] = None,
-    log_rollout_interval: int = 20,
+    log_rollout_interval: int = 1,
     rollout_save_dir: str = "./rollout",
     enable_profiling: bool = False,
     n_behind: int = 0,

applications/ColossalChat/coati/distributed/producer.py

Lines changed: 36 additions & 21 deletions
@@ -93,7 +93,14 @@ def __init__(
             reward_model_kwargs = {
                 k: v
                 for k, v in grpo_config.items()
-                if k in ["soft_over_length_punishment", "max_new_tokens", "cache_length", "code_verifier_api_url"]
+                if k
+                in [
+                    "soft_over_length_punishment",
+                    "max_new_tokens",
+                    "cache_length",
+                    "code_verifier_api_url",
+                    "forced_patterns",
+                ]
             }
             self.response_format_tags = grpo_config.get("response_format_tags", None)
         if producer_idx == 0 and rollout_log_file is not None:
@@ -103,7 +110,7 @@ def __init__(
                 )
             else:
                 os.makedirs(os.path.dirname(rollout_log_file), exist_ok=True)
-                self.rollout_log_file = open(rollout_log_file, "w", encoding="utf8")
+                self.rollout_log_file = open(rollout_log_file, "a", encoding="utf8")
         if self.producer_idx == 0:
             self.wandb_run = wandb.init(
                 project=project_name,
@@ -260,6 +267,9 @@ def sync_model(self, episode, step) -> None:
            state_dict = ray_broadcast_tensor_dict(
                None, self.num_producers, device=self.device, group_name="sync_model"
            )
+            print(
+                f"[P{self.producer_idx}] Sync model episode {episode} step {(step + 1) // self.num_microbatches - 1} done"
+            )
            if "consumer_global_step" in state_dict:
                self.consumer_global_step = state_dict.pop("consumer_global_step").item()
            self.load_state_dict(state_dict)
@@ -498,7 +508,8 @@ def rollout(self, input_ids, attention_mask, **kwargs):
                        "rollout": self.tokenizer.batch_decode(
                            rollouts["input_ids"][:, 0], skip_special_tokens=True
                        ),
-                    }
+                    },
+                    ensure_ascii=False,
                )
                + "\n"
            )
@@ -583,8 +594,10 @@ def __init__(
        self.eval_generation_config["n"] = 1  # use 1 generation for evaluation
        self.eval_generation_config.update(eval_generation_config)
        self.eval_sample_params = SamplingParams(**self.eval_generation_config)
-        self.ready_processes = 0
-        self.condition = asyncio.Condition()
+        self.ready_processes_sync_model = 0
+        self.ready_processes_sync_data = 0
+        self.sync_model_condition = asyncio.Condition()
+        self.sync_data_condition = asyncio.Condition()
        self.data_ready_for_sending = []
 
    @torch.no_grad()
@@ -613,6 +626,7 @@ async def generate(self, input_ids, attention_mask, **kwargs):
            ).cpu()  # CUDA tensor is not serializable by ray
            for k in rollouts[0].keys()
        }
+        rollouts["consumer_global_step"] = self.consumer_global_step
        return rollouts
 
    @torch.no_grad()
@@ -634,33 +648,33 @@ async def async_sync_model(self, episode, step, num_processes: int = 1) -> None:
        Asyncronous version to sync model from consumer to producer.
        called by another producer, such as agentic producer.
        """
-        async with self.condition:
-            self.ready_processes += 1
+        async with self.sync_model_condition:
+            self.ready_processes_sync_model += 1
            # Wait until all processes are ready
-            if self.ready_processes < num_processes:
-                await self.condition.wait()
+            if self.ready_processes_sync_model < num_processes:
+                await self.sync_model_condition.wait()
 
-            # Only one process should reset `ready_processes` and perform the sync
-            if self.ready_processes == num_processes:
-                self.ready_processes = 0
-                self.condition.notify_all()  # Notify all waiting processes
+            # Only one process should reset `ready_processes_sync_model` and perform the sync
+            if self.ready_processes_sync_model == num_processes:
+                self.ready_processes_sync_model = 0
+                self.sync_model_condition.notify_all()  # Notify all waiting processes
                self.sync_model(episode, step)
 
    async def async_sync_data(self, data: Dict[str, torch.Tensor], num_processes: int = 1) -> None:
        # merge data dict
-        async with self.condition:
-            self.ready_processes += 1
+        async with self.sync_data_condition:
+            self.ready_processes_sync_data += 1
            if data:
                self.data_ready_for_sending.append(data)
 
            # Wait until all processes are ready
-            if self.ready_processes < num_processes:
-                await self.condition.wait()
+            if self.ready_processes_sync_data < num_processes:
+                await self.sync_data_condition.wait()
 
            # Only one process should reset `ready_processes` and perform the sync
-            if self.ready_processes == num_processes:  # wait for all producers to join
-                self.ready_processes = 0
-                self.condition.notify_all()
+            if self.ready_processes_sync_data == num_processes:  # wait for all producers to join
+                self.ready_processes_sync_data = 0
+                self.sync_data_condition.notify_all()
                # merge data for sending
                if len(self.data_ready_for_sending) >= 1:
                    batch_rollout_data = {}
@@ -856,7 +870,8 @@ async def rollout(self, input_ids, attention_mask, **kwargs):
                        "rollout": self.tokenizer.batch_decode(
                            rollouts["input_ids"][:, 0], skip_special_tokens=True
                        ),
-                    }
+                    },
+                    ensure_ascii=False,
                )
                + "\n"
            )
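The async_sync_model / async_sync_data changes replace the single shared counter and asyncio.Condition with a dedicated pair per rendezvous, so a model-sync barrier and a data-sync barrier running concurrently can no longer count each other's arrivals and release early. A self-contained sketch of the barrier pattern with two independent conditions (the class and names are illustrative, not the repository API):

import asyncio

class TwoBarriers:
    def __init__(self):
        self.ready_sync_model = 0
        self.ready_sync_data = 0
        self.sync_model_condition = asyncio.Condition()
        self.sync_data_condition = asyncio.Condition()

    async def sync_model(self, num_processes: int):
        async with self.sync_model_condition:
            self.ready_sync_model += 1
            # wait until every participant of *this* barrier has arrived
            if self.ready_sync_model < num_processes:
                await self.sync_model_condition.wait()
            # the last arrival resets the counter, wakes the others and does the work
            if self.ready_sync_model == num_processes:
                self.ready_sync_model = 0
                self.sync_model_condition.notify_all()
                # ... perform the actual model sync here ...

    async def sync_data(self, num_processes: int):
        async with self.sync_data_condition:
            self.ready_sync_data += 1
            if self.ready_sync_data < num_processes:
                await self.sync_data_condition.wait()
            if self.ready_sync_data == num_processes:
                self.ready_sync_data = 0
                self.sync_data_condition.notify_all()
                # ... merge and send the collected data here ...

async def main():
    b = TwoBarriers()
    # with one shared counter, these four concurrent calls could release each other too early
    await asyncio.gather(b.sync_model(2), b.sync_model(2), b.sync_data(2), b.sync_data(2))

asyncio.run(main())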

applications/ColossalChat/coati/distributed/reward/reward_fn.py

Lines changed: 13 additions & 3 deletions
@@ -19,6 +19,7 @@
 
 
 import json
+import re
 
 import torch
 from latex2sympy2_extended import NormalizationConfig
@@ -126,6 +127,12 @@ def math_reward_fn(input_ids, gt_answer, response_idx, **kwargs):
 
     format_valid = validate_response_structure(processed_str, kwargs["tags"])
 
+    if "forced_patterns" in kwargs and kwargs["forced_patterns"]:
+        forced_patterns = kwargs["forced_patterns"]
+        format_valid = format_valid and all(
+            [re.search(pattern, decoded_final_answer) is not None for pattern in forced_patterns]
+        )
+
     # Check answer accuracy, answer is considered correct if the answer is correct and the format is valid
     if final_answer is not None:
         if eval_mode or format_valid:
@@ -161,7 +168,7 @@ def boxed_math_reward_fn(input_ids, gt_answer, response_idx, **kwargs):
     tokenizer = kwargs["tokenizer"]
     eval_mode = kwargs.get("eval_mode", False)
     soft_over_length_punishment = kwargs.get("soft_over_length_punishment", False)
-    acc_score = 10.0
+    acc_score = 1.0
     reward = torch.tensor(0.0)
     format_acc = torch.tensor(0.0)
     ans_acc = torch.tensor(0.0)
@@ -182,15 +189,18 @@ def boxed_math_reward_fn(input_ids, gt_answer, response_idx, **kwargs):
         raise ValueError("no gt_answer is provided, please check your training dataset.")
 
     decoded_final_answer = tokenizer.decode(input_ids[s : e + 1], skip_special_tokens=True)
-    # print(f"decoded_final_answer: {decoded_final_answer[-100:]}", gt_answer)
     final_answer = extract_boxed_solution(decoded_final_answer)
     format_valid = final_answer is not None
     if "tags" in kwargs and kwargs["tags"]:
         tags = kwargs["tags"]
         format_valid = format_valid and all(
             [decoded_final_answer.count(tags[tag]["text"]) == tags[tag]["num_occur"] for tag in tags]
         )
-
+    if "forced_patterns" in kwargs and kwargs["forced_patterns"]:
+        forced_patterns = kwargs["forced_patterns"]
+        format_valid = format_valid and all(
+            [re.search(pattern, decoded_final_answer) is not None for pattern in forced_patterns]
+        )
     # Check answer accuracy, answer is considered correct if the answer is correct and the format is valid
     if final_answer is not None:
         if eval_mode or format_valid:
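The forced_patterns check added in both reward functions makes format validity additionally require that every configured regular expression matches the decoded response. A standalone illustration using the pattern registered in rl_example.py below (the sample response text is made up):

import re

decoded_final_answer = (
    "<tool_call>{\"name\": \"python_repl\"}</tool_call>\n"
    "<tool_response>\n42\n</tool_response>\n"
    "The answer is \\boxed{42}."
)
forced_patterns = [r"<tool_response>\n.+\n</tool_response>"]  # require at least one tool response

format_valid = True  # outcome of the existing tag / boxed-answer checks
format_valid = format_valid and all(
    re.search(pattern, decoded_final_answer) is not None for pattern in forced_patterns
)
print(format_valid)  # True only if every forced pattern appears in the response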
Lines changed: 8 additions & 0 deletions (new file)
@@ -0,0 +1,8 @@
{
"chat_template": "{%- if tools %}\\n {{- \'<|im_start|>system\\\\n\' }}\\n {%- if messages[0].role == \'system\' %}\\n {{- messages[0].content + \'\\\\n\\\\n\' }}\\n {%- endif %}\\n {{- \\"# Tools\\\\n\\\\nYou may call one or more functions to assist with the user query.\\\\n\\\\nYou are provided with function signatures within <tools></tools> XML tags:\\\\n<tools>\\" }}\\n {%- for tool in tools %}\\n {{- \\"\\\\n\\" }}\\n {{- tool | tojson }}\\n {%- endfor %}\\n {{- \\"\\\\n</tools>\\\\n\\\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\\\n<tool_call>\\\\n{\\\\\\"name\\\\\\": <function-name>, \\\\\\"arguments\\\\\\": <args-json-object>}\\\\n</tool_call><|im_end|>\\\\n\\" }}\\n{%- else %}\\n {%- if messages[0].role == \'system\' %}\\n {{- \'<|im_start|>system\\\\n\' + messages[0].content + \'<|im_end|>\\\\n\' }}\\n {%- endif %}\\n{%- endif %}\\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\\n{%- for message in messages[::-1] %}\\n {%- set index = (messages|length - 1) - loop.index0 %}\\n {%- if ns.multi_step_tool and message.role == \\"user\\" and message.content is string and not(message.content.startswith(\'<tool_response>\') and message.content.endswith(\'</tool_response>\')) %}\\n {%- set ns.multi_step_tool = false %}\\n {%- set ns.last_query_index = index %}\\n {%- endif %}\\n{%- endfor %}\\n{%- for message in messages %}\\n {%- if message.content is string %}\\n {%- set content = message.content %}\\n {%- else %}\\n {%- set content = \'\' %}\\n {%- endif %}\\n {%- if (message.role == \\"user\\") or (message.role == \\"system\\" and not loop.first) %}\\n {{- \'<|im_start|>\' + message.role + \'\\\\n\' + content + \'<|im_end|>\' + \'\\\\n\' }}\\n {%- elif message.role == \\"assistant\\" %}\\n {{- \'<|im_start|>\' + message.role + \'\\\\n\' + content }}\\n {%- if message.tool_calls %}\\n {%- for tool_call in message.tool_calls %}\\n {%- if (loop.first and content) or (not loop.first) %}\\n {{- \'\\\\n\' }}\\n {%- endif %}\\n {%- if tool_call.function %}\\n {%- set tool_call = tool_call.function %}\\n {%- endif %}\\n {{- \'<tool_call>\\\\n{\\"name\\": \\"\' }}\\n {{- tool_call.name }}\\n {{- \'\\", \\"arguments\\": \' }}\\n {%- if tool_call.arguments is string %}\\n {{- tool_call.arguments }}\\n {%- else %}\\n {{- tool_call.arguments | tojson }}\\n {%- endif %}\\n {{- \'}\\\\n</tool_call>\' }}\\n {%- endfor %}\\n {%- endif %}\\n {{- \'<|im_end|>\\\\n\' }}\\n {%- elif message.role == \\"tool\\" %}\\n {%- if loop.first or (messages[loop.index0 - 1].role != \\"tool\\") %}\\n {{- \'<|im_start|>user\' }}\\n {%- endif %}\\n {{- \'\\\\n<tool_response>\\\\n\' }}\\n {{- content }}\\n {{- \'\\\\n</tool_response>\' }}\\n {%- if loop.last or (messages[loop.index0 + 1].role != \\"tool\\") %}\\n {{- \'<|im_end|>\\\\n\' }}\\n {%- endif %}\\n {%- endif %}\\n{%- endfor %}\\n{%- if add_generation_prompt %}\\n {{- \'<|im_start|>assistant\\\\n\' }}\\n {%- if enable_thinking is defined and enable_thinking is false %}\\n {{- \'<think>\\\\n\\\\n</think>\\\\n\\\\n\' }}\\n {%- endif %}\\n{%- endif %}",
"system_message": "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n",
"stop_ids": [
    7
],
"end_of_assistant": "<|im_end|>"
}

applications/ColossalChat/rl_example.py

Lines changed: 12 additions & 1 deletion
@@ -131,6 +131,7 @@
        default=1.0,
        help="Top p for sampling. Please check the generation arguments documentation for your backend.",
    )
+    parser.add_argument("-ct", "--chat-template", type=str, default=None, help="Chat template to use for the model.")
    parser.add_argument("-s", "--system-prompt", type=str, default=None, help="System prompt for data construction.")
    parser.add_argument("-mnt", "--max-new-tokens", type=int, default=1024 * 4 - 512, help="Max length for generation.")
    parser.add_argument("-mpt", "--max-prompt-tokens", type=int, default=512, help="Max length for prompt.")
@@ -427,11 +428,20 @@
                "llm_call_budget": 10,
                "max_tokens": 2048,
            }
+            grpo_config["forced_patterns"] = [
+                r"<tool_response>\n.+\n</tool_response>"
+            ]  # force at least one correct tool call
        else:
            raise ValueError(f"Unsupported agentic model type: {args.agentic_type}")
    else:
        agentic_config = None
 
+    tokenizer_config = {
+        "path": args.model,
+        "trust_remote_code": True,
+        "chat_template": args.chat_template,
+    }
+
    launch_distributed(
        num_producers=args.num_inferencer,
        num_proc_per_producer=inference_model_config.get("tensor_parallel_size", args.producer_tensor_parallel_size)
@@ -453,6 +463,7 @@
        train_model_config=train_model_config,
        grpo_config=grpo_config,
        agentic_config=agentic_config,
+        tokenizer_config=tokenizer_config,
        plugin_config={
            "tp_size": args.tensor_parallel_size,
            "pp_size": args.pipeline_parallel_size,
@@ -480,7 +491,7 @@
        eval_interval=args.eval_interval,
        eval_save_dir=os.path.join(args.eval_save_dir, args.project.replace(" ", "_")),
        eval_generation_config=eval_generation_config,
-        log_rollout_interval=20,
+        log_rollout_interval=1,
        rollout_save_dir=args.rollout_save_dir,
        enable_profiling=args.enable_profiling,
        n_behind=args.n_behind,
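rl_example.py now assembles a tokenizer_config (model path, trust_remote_code, optional chat template file) and passes it to launch_distributed instead of having the agentic producer build its own tokenizer from AutoTokenizer. How the config is consumed is not shown in this diff, so the following is only one plausible reading, under the assumption that the chat-template file is the JSON added in this commit and that its "chat_template" field overrides tokenizer.chat_template:

import json

from transformers import AutoTokenizer

# illustrative stand-ins for args.model and args.chat_template
tokenizer_config = {
    "path": "Qwen/Qwen2.5-7B-Instruct",
    "trust_remote_code": True,
    "chat_template": "path/to/chat_template.json",  # may be None if no template is given
}

tokenizer = AutoTokenizer.from_pretrained(
    tokenizer_config["path"], trust_remote_code=tokenizer_config["trust_remote_code"]
)
if tokenizer_config.get("chat_template"):
    with open(tokenizer_config["chat_template"], encoding="utf8") as f:
        # assumption: the file has the same layout as the template JSON added above
        tokenizer.chat_template = json.load(f)["chat_template"]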
