
Commit 2b46ab1

simplify _run_agentic_pipeline; fix old_log_probs
1 parent d47c563 commit 2b46ab1

4 files changed, +17 -25 lines changed


applications/ColossalChat/coati/distributed/agent/agentic_producer.py

Lines changed: 4 additions & 8 deletions
@@ -224,9 +224,7 @@ def _run_agentic_pipeline(self, messages):
 if llm_call_count > self.llm_call_budget:
     print(f"LLM call budget exceeded: {llm_call_count} > {self.llm_call_budget}. Stopping.")
     del self.async_llm_engine_map[request_id]
-    while messages[-1]["role"] == "tool":
-        messages.pop()
-    return messages, logprobs
+    return messages, response_input_ids, logprobs
 inputs = self._build_prompt(messages, return_dict=True, return_tensors="pt")
 if num_prompt_tokens == 0:
     num_prompt_tokens = inputs["input_ids"].size(-1)
@@ -235,9 +233,7 @@ def _run_agentic_pipeline(self, messages):
         f"Max tokens exceeded: Current have generated {inputs['input_ids'].size(-1) - num_prompt_tokens} tokens > {self.generate_config.get('max_tokens', 512)}. Stopping."
     )
     del self.async_llm_engine_map[request_id]
-    while messages[-1]["role"] == "tool":
-        messages.pop()
-    return messages, logprobs
+    return messages, response_input_ids, logprobs
 async_producer = self._select_async_producer(request_id=request_id)
 agentic_generate_config = copy.deepcopy(self.generate_config)
 agentic_generate_config["max_tokens"] = self.agentic_config.get("max_tokens", 2048)
@@ -262,7 +258,7 @@ def _run_agentic_pipeline(self, messages):
 if tool_call_count > self.tool_call_budget:
     print(f"Tool call budget exceeded: {tool_call_count} > {self.tool_call_budget}. Stopping.")
     del self.async_llm_engine_map[request_id]
-    return messages, logprobs
+    return messages, response_input_ids, logprobs
 tool_call_count += len(assistant_message["tool_calls"])
 handlers = []
 for tool_call in assistant_message["tool_calls"]:
@@ -277,4 +273,4 @@ def _run_agentic_pipeline(self, messages):
 else:
     # no further tool call, return the messages
     del self.async_llm_engine_map[request_id]
-    return messages, logprobs
+    return messages, response_input_ids, logprobs
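
Taken together, the hunks above collapse _run_agentic_pipeline's exit paths onto one return shape: every branch now hands back the conversation, its token ids, and the matching log-probabilities, and trailing tool messages are kept rather than popped. A minimal, self-contained sketch of that contract (a toy stand-in, not the repository's class; names, types, and the simplified control flow are illustrative assumptions):

from typing import Any, Dict, List, Tuple

Message = Dict[str, Any]

def run_pipeline_sketch(
    messages: List[Message],
    response_input_ids: Any,  # token ids of the conversation rendered so far
    logprobs: Any,            # per-token log-probabilities aligned with those ids
    llm_call_count: int,
    llm_call_budget: int,
) -> Tuple[List[Message], Any, Any]:
    # Every exit path returns the same triple, so the caller (rollout() in base.py)
    # can rebuild attention/action masks without re-tokenizing the messages.
    if llm_call_count > llm_call_budget:
        # budget exceeded: stop and return whatever has been accumulated
        return messages, response_input_ids, logprobs
    # ... generation and tool-call handling would continue here in the real pipeline ...
    return messages, response_input_ids, logprobs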

applications/ColossalChat/coati/distributed/agent/base.py

Lines changed: 7 additions & 9 deletions
@@ -123,32 +123,30 @@ def rollout(self, **kwargs) -> Dict[str, torch.Tensor]:
 )

 for i in range(self.num_generations):
-    _messages, logprobs = results[i]
-    response_input_ids = self._build_prompt(
-        _messages, return_dict=True, return_tensors="pt", add_generation_prompt=False
-    )["input_ids"]
+    # due to the multiround feature, action_mask and attention_mask need to be recomputed
+    _messages, response_input_ids, logprobs = results[i]
     # truncate if too long
-    response_input_ids = response_input_ids[:, : self.grpo_config["max_length"] - to_pad_left]
+    response_input_ids = response_input_ids[0, :, : self.grpo_config["max_length"] - to_pad_left]
     # add left right padding
-    to_pad_right = self.grpo_config["max_length"] - response_input_ids.shape[1] - to_pad_left
-    response_length = response_input_ids.shape[1] - prompt_length
+    to_pad_right = self.grpo_config["max_length"] - response_input_ids.size(-1) - to_pad_left
     input_ids = torch.nn.functional.pad(
         response_input_ids, (to_pad_left, to_pad_right), "constant", value=self.tokenizer.pad_token_id
     ) # [1, max_length]
     attention_mask = input_ids.ne(self.tokenizer.pad_token_id).int() # [1, max_length]
     action_mask = input_ids[:, max_prompt_length:].ne(self.tokenizer.pad_token_id).int()
+    response_length = action_mask.sum().item()
     rollouts["attention_mask"].append(attention_mask)
     rollouts["action_mask"].append(action_mask)
     truncated_logprobs = logprobs[
-        :, :, prompt_length : prompt_length + self.generate_config["max_tokens"]
+        0, :, prompt_length : prompt_length + self.generate_config["max_tokens"]
     ] # truncate to max_new_tokens
     logprobs_padded = torch.nn.functional.pad(
         truncated_logprobs,
         (0, self.generate_config["max_tokens"] - truncated_logprobs.size(-1)),
         "constant",
         value=0.0,
     ) # [1, max_new_tokens]
-    rollouts["action_log_probs"].append(logprobs_padded[0])
+    rollouts["action_log_probs"].append(logprobs_padded)
     rollouts["response_idx"].append(
         torch.tensor(
             [
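
In the rollout loop above, the masks are now derived from the padded ids rather than carried over from generation, since a multi-round conversation interleaves assistant and tool turns; response_length is likewise counted from the action mask instead of being inferred from tensor shapes. A small runnable sketch of that padding-and-masking step, with toy token ids and made-up values for pad_token_id, max_length, max_prompt_length, and to_pad_left:

import torch

pad_token_id = 0
max_length = 12        # stands in for grpo_config["max_length"]
max_prompt_length = 4  # stands in for the left-padded prompt budget
to_pad_left = 1        # left padding already reserved for the prompt

# pretend the agentic pipeline returned 9 response tokens for this sample
response_input_ids = torch.tensor([[101, 102, 103, 104, 105, 106, 107, 108, 109]])

# truncate if too long, then pad on both sides up to max_length
response_input_ids = response_input_ids[:, : max_length - to_pad_left]
to_pad_right = max_length - response_input_ids.size(-1) - to_pad_left
input_ids = torch.nn.functional.pad(
    response_input_ids, (to_pad_left, to_pad_right), "constant", value=pad_token_id
)  # [1, max_length]

# masks are recomputed from the padded ids, mirroring the diff
attention_mask = input_ids.ne(pad_token_id).int()                      # [1, max_length]
action_mask = input_ids[:, max_prompt_length:].ne(pad_token_id).int()  # [1, max_length - max_prompt_length]
response_length = action_mask.sum().item()  # number of non-pad generated tokens
print(input_ids.shape, attention_mask.sum().item(), response_length)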

applications/ColossalChat/coati/distributed/loss.py

Lines changed: 2 additions & 2 deletions
@@ -37,9 +37,9 @@ def forward(
     total_effective_tokens_in_batch: torch.Tensor = None,
 ) -> torch.Tensor:
     if action_mask is None:
-        ratio = (log_probs - log_probs.detach()).exp()
+        ratio = (log_probs - old_log_probs.detach()).exp()
     else:
-        ratio = ((log_probs - log_probs.detach()) * action_mask).exp()
+        ratio = ((log_probs - old_log_probs.detach()) * action_mask).exp()

     surr1 = ratio * advantages
     surr2 = ratio.clamp(1 - self.clip_eps_low, 1 + self.clip_eps_high) * advantages
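
The one-line change above is the old_log_probs fix from the commit title: (log_probs - log_probs.detach()).exp() evaluates to exactly 1 for every token, so the importance weight never reflected the gap between the current policy and the policy that produced the rollout, and the clip bounds below could never activate. With old_log_probs it becomes the usual clipped-surrogate ratio. A toy numeric illustration (made-up values; the final -min(surr1, surr2) reduction is the textbook form and may differ from the repository's masked reduction):

import torch

log_probs = torch.tensor([-1.2, -0.7, -2.0], requires_grad=True)  # current policy
old_log_probs = torch.tensor([-1.0, -0.9, -1.5])                  # rollout-time policy
advantages = torch.tensor([1.0, -0.5, 2.0])
clip_eps_low, clip_eps_high = 0.2, 0.2

buggy_ratio = (log_probs - log_probs.detach()).exp()      # always exactly [1., 1., 1.]
fixed_ratio = (log_probs - old_log_probs.detach()).exp()  # true importance ratio

surr1 = fixed_ratio * advantages
surr2 = fixed_ratio.clamp(1 - clip_eps_low, 1 + clip_eps_high) * advantages
loss = -torch.min(surr1, surr2).mean()
print(buggy_ratio.detach(), fixed_ratio.detach(), loss.item())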

applications/ColossalChat/rl_example.py

Lines changed: 4 additions & 6 deletions
@@ -429,18 +429,16 @@
             "max_tokens": 2048,
         }
         grpo_config["forced_patterns"] = [
-            r"<tool_response>\n.+\n</tool_response>"
+            r"<tool_response>\n.+\n</tool_response>" # please modify based on your tool response format
         ] # force at least one correct tool call
     else:
         raise ValueError(f"Unsupported agentic model type: {args.agentic_type}")
 else:
     agentic_config = None

-tokenizer_config = {
-    "path": args.model,
-    "trust_remote_code": True,
-    "chat_template": args.chat_template,
-}
+tokenizer_config = {"path": args.model, "trust_remote_code": True}
+if args.chat_template is not None:
+    tokenizer_config["chat_template"] = args.chat_template

 launch_distributed(
     num_producers=args.num_inferencer,
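
The tokenizer_config change only forwards chat_template when one was actually supplied, so an unset CLI argument no longer pushes an explicit None into the config. A tiny illustration of the pattern (the model path and the loader's fallback behaviour are assumptions, not taken from the repository):

args_chat_template = None  # e.g. no chat template was passed on the command line

tokenizer_config = {"path": "path/to/your/model", "trust_remote_code": True}
if args_chat_template is not None:
    tokenizer_config["chat_template"] = args_chat_template

print(tokenizer_config)  # no "chat_template": None entry to shadow the model's default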
