
Commit dc2153a

remove debug lines and fix pre-commit error
1 parent f21184c commit dc2153a

6 files changed: +31 lines, -42 lines

trinity/algorithm/policy_loss_fn/ppo_policy_loss.py

Lines changed: 20 additions & 4 deletions
@@ -19,6 +19,7 @@ def __init__(
         clip_range: Optional[float] = None,
         clip_range_low: Optional[float] = None,
         clip_range_high: Optional[float] = None,
+        clip_ratio_c: Optional[float] = 3.0,
         loss_agg_mode: Optional[str] = "token-mean",
     ) -> None:
         super().__init__(backend=backend)
@@ -30,8 +31,13 @@ def __init__(
             self.clip_range_high = clip_range
         else:
             self.clip_range_high = clip_range_high
+        self.clip_ratio_c = clip_ratio_c
         assert self.clip_range_low is not None, "clip_range_low must be specified."
         assert self.clip_range_high is not None, "clip_range_high must be specified."
+        assert self.clip_ratio_c is not None and self.clip_ratio_c > 1.0, (
+            "The lower bound of the clip_ratio_c for dual-clip PPO should be greater than 1.0,"
+            f" but got the value: {clip_ratio_c}."
+        )
         self.loss_agg_mode = loss_agg_mode

     def __call__(  # type: ignore
@@ -43,28 +49,38 @@ def __call__(  # type: ignore
         **kwargs,
     ) -> Tuple[torch.Tensor, Dict]:
         negative_approx_kl = logprob - old_logprob
+        negative_approx_kl = torch.clamp(negative_approx_kl, min=-20.0, max=20.0)
         ratio = torch.exp(negative_approx_kl)
         ppo_kl = masked_mean(-negative_approx_kl, action_mask)

-        pg_losses = -advantages * ratio
+        pg_losses1 = -advantages * ratio
         pg_losses2 = -advantages * torch.clamp(
             ratio, 1.0 - self.clip_range_low, 1.0 + self.clip_range_high  # type: ignore
         )
+        clip_pg_losses1 = torch.maximum(pg_losses1, pg_losses2)
+        pg_clipfrac = masked_mean(torch.gt(pg_losses2, pg_losses1).float(), action_mask)

-        pg_loss = masked_loss(
-            torch.max(pg_losses, pg_losses2), action_mask, loss_agg_mode=self.loss_agg_mode
+        pg_losses3 = -advantages * self.clip_ratio_c
+        clip_pg_losses2 = torch.min(pg_losses3, clip_pg_losses1)
+        pg_clipfrac_lower = masked_mean(
+            torch.gt(clip_pg_losses1, pg_losses3) * (advantages < 0).float(), action_mask
         )
-        pg_clipfrac = masked_mean(torch.gt(pg_losses2, pg_losses).float(), action_mask)
+
+        pg_losses = torch.where(advantages < 0, clip_pg_losses2, clip_pg_losses1)
+        pg_loss = masked_loss(pg_losses, action_mask, loss_agg_mode=self.loss_agg_mode)
+
         metrics = {
             "pg_clipfrac": pg_clipfrac.detach().item(),
             "ppo_kl": ppo_kl.detach().item(),
             "pg_loss": pg_loss.detach().item(),
+            "pg_clipfrac_lower": pg_clipfrac_lower.detach().item(),
         }
         return pg_loss, metrics

     @classmethod
     def default_args(cls) -> Dict:
         return {
             "clip_range": 0.2,
+            "clip_ratio_c": 3.0,
             "loss_agg_mode": "token-mean",
         }
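For reference, the change above extends the standard PPO clipped objective into its dual-clip variant: for tokens with negative advantage, clip_ratio_c caps the per-token loss so a very large importance ratio cannot dominate the update. Below is a minimal, self-contained sketch of the same computation in plain PyTorch; the function name and the local masked_mean helper are illustrative stand-ins for the repository's masked_mean/masked_loss utilities, not the actual implementation.

import torch


def masked_mean(x: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
    # Mean of x over positions where mask == 1 (stand-in for the repo's helper).
    mask = mask.to(x.dtype)
    return (x * mask).sum() / mask.sum().clamp(min=1.0)


def dual_clip_ppo_loss(
    logprob: torch.Tensor,      # (batch, seq) log-probs under the current policy
    old_logprob: torch.Tensor,  # (batch, seq) log-probs under the behaviour policy
    advantages: torch.Tensor,   # (batch, seq) per-token advantages
    action_mask: torch.Tensor,  # (batch, seq) 1 for response tokens, 0 for padding
    clip_range_low: float = 0.2,
    clip_range_high: float = 0.2,
    clip_ratio_c: float = 3.0,  # dual-clip bound, must be > 1.0
) -> torch.Tensor:
    # Importance ratio; the log-ratio is clamped for numerical stability,
    # mirroring the clamp added in the diff above.
    log_ratio = torch.clamp(logprob - old_logprob, min=-20.0, max=20.0)
    ratio = torch.exp(log_ratio)

    # Standard PPO clipped surrogate (losses are negated objectives).
    pg_losses1 = -advantages * ratio
    pg_losses2 = -advantages * torch.clamp(ratio, 1.0 - clip_range_low, 1.0 + clip_range_high)
    clip_pg_losses1 = torch.maximum(pg_losses1, pg_losses2)

    # Dual-clip: when the advantage is negative, cap the per-token loss at
    # -advantage * clip_ratio_c so an exploding ratio cannot blow it up.
    pg_losses3 = -advantages * clip_ratio_c
    clip_pg_losses2 = torch.minimum(pg_losses3, clip_pg_losses1)

    pg_losses = torch.where(advantages < 0, clip_pg_losses2, clip_pg_losses1)
    return masked_mean(pg_losses, action_mask)


if __name__ == "__main__":
    torch.manual_seed(0)
    shape = (2, 8)
    loss = dual_clip_ppo_loss(
        logprob=torch.randn(shape),
        old_logprob=torch.randn(shape),
        advantages=torch.randn(shape),
        action_mask=torch.ones(shape),
    )
    print(loss.item())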

trinity/common/config.py

Lines changed: 1 addition & 2 deletions
@@ -926,7 +926,6 @@ def _check_trainer_input(self) -> None:
         experience_buffer.batch_size = self.buffer.train_batch_size
         experience_buffer.tokenizer_path = self.model.model_path
         set_if_none(experience_buffer, "ray_namespace", self.ray_namespace)
-        # TODO: this cannot apply chat_template_path, as check_model is later than this line
         set_if_none(experience_buffer.format, "chat_template", self.model.custom_chat_template)
         for aux_name, aux_buffer in trainer_input.auxiliary_buffers.items():
             aux_buffer.batch_size = self.buffer.train_batch_size
@@ -1069,7 +1068,7 @@ def _check_model(self) -> None:
             model.critic_model_path = model.model_path

         # check template
-        if model.chat_template_path:
+        if model.chat_template_path and model.custom_chat_template is None:
             with open(model.chat_template_path, "r") as f:
                 model.custom_chat_template = f.read()
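The one-line guard above makes an explicitly configured custom_chat_template take precedence over chat_template_path. A small illustrative sketch of that precedence; SimpleNamespace and check_template are hypothetical stand-ins for the real config object and check logic, not the repository's API.

from types import SimpleNamespace


def check_template(model):
    # Read the template file only when no template was set explicitly.
    if model.chat_template_path and model.custom_chat_template is None:
        with open(model.chat_template_path, "r") as f:
            model.custom_chat_template = f.read()


# An explicitly set template is kept even if a template path is also given.
model = SimpleNamespace(
    chat_template_path="my_template.jinja",  # hypothetical path, never opened here
    custom_chat_template="{{ messages }}",
)
check_template(model)
assert model.custom_chat_template == "{{ messages }}"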

trinity/common/models/vllm_model.py

Lines changed: 1 addition & 6 deletions
@@ -370,19 +370,14 @@ async def convert_messages_to_experience(
             enable_thinking=self.enable_thinking,
         )  # (seq_length, ), (seq_length, )
         logprobs = await self.logprobs(token_ids=token_ids.tolist())  # (seq_length - 1,)
-        exp = Experience(
+        return Experience(
             tokens=token_ids,
             logprobs=logprobs[prompt_length - 1 :],
             prompt_length=prompt_length,
             action_mask=action_mask[prompt_length:],  # Exclude the prompt
             prompt_text=self.tokenizer.decode(token_ids[:prompt_length]),
             response_text=self.tokenizer.decode(token_ids[prompt_length:]),
         )
-        import torch
-        torch.set_printoptions(threshold=float('inf'))
-        print(f"!!!Debug: {exp.tokens=} {exp.logprobs=} {exp.prompt_length=} {exp.action_mask=} {exp.prompt_text=} {exp.response_text=}")
-        print("sum(action_mask): ", torch.sum(exp.action_mask))
-        return exp

     async def shutdown(self):
         """Shutdown the vLLM v1 engine. This kills child processes forked

trinity/common/workflows/envs/frozen_lake/workflow.py

Lines changed: 9 additions & 21 deletions
@@ -100,15 +100,13 @@ def __init__(
         self.use_multistep_prompt = workflow_args.get("use_multistep_prompt", False)
         self.desc = workflow_args.get("desc", None)
         self.is_slippery = workflow_args.get("is_slippery", False)
-        print(f"{self.rollout_args =}")
         self.max_response_tokens = self.rollout_args.get("max_response_tokens", 10240)

         # Extract task-specific arguments
         self.raw_task = task.raw_task if hasattr(task, "raw_task") else {}
         self.size = self.raw_task.get("size", 1)
         self.p = self.raw_task.get("p", 0.8)
         self.seed = self.raw_task.get("seed", 42)
-        print("self.size: ", self.size, "self.p: ", self.p, "self.seed: ", self.seed)

         if self.desc is None:
             random_map, goal_position = generate_random_map(
@@ -241,11 +239,17 @@ def render(self, mode="tiny_rgb_array"):
         room_state = self.render(mode="state").tolist()

         if mode == "list":
-            lookup = lambda cell: GRID_LOOKUP.get(cell, "?").strip("\t").strip()
+
+            def lookup(cell):
+                return GRID_LOOKUP.get(cell, "?").strip("\t").strip()
+
             return [" ".join(lookup(cell) for cell in row) for row in room_state]

         if mode == "tiny_rgb_array":
-            lookup = lambda cell: GRID_LOOKUP.get(cell, "?")
+
+            def lookup(cell):
+                return GRID_LOOKUP.get(cell, "?")
+
             result = "\n".join("".join(lookup(cell) for cell in row) for row in room_state)
             return result

@@ -271,7 +275,6 @@ async def run_async(self) -> List[Experience]:

         # Run episode until done or max_steps reached
         for step in range(self.max_steps):
-            print("Current step: ", step)
             # Format observation for the model
             current_obs_str = str(self.current_observation)
             user_prompt_content = (
@@ -301,11 +304,9 @@
             else:
                 response_token_len = messages_token_len - init_prompt_token_len
             max_tokens = self.max_response_tokens - response_token_len
-            print(
-                f"!!!Debug: {max_tokens=} used_response_tokens = {self.max_response_tokens-max_tokens} {messages_token_len=} {init_prompt_token_len=}"
-            )

             if max_tokens <= 0:
+                # messages = messages[:-1]  # TODO: apply this?
                 self.done = False
                 self.final_reward = 0
                 break
@@ -314,13 +315,9 @@
             rollout_args = self.rollout_args.copy()
             rollout_args["n"] = 1
             rollout_args["max_tokens"] = max_tokens
-            # print("Current step: ", step, rollout_args)
             responses = await self.model.chat_async(messages, **rollout_args)
             response_text = responses[0].response_text
             messages.append({"role": "assistant", "content": response_text})
-            print(
-                "raw response: ", response_text
-            )  # sometimes has <think></think> and <action>, somtimes not

             # Parse action from response
             _, action_str = self._parse_model_response(response_text)
@@ -349,15 +346,6 @@
                 "success": 1 if self.final_reward == 1.0 else 0,
             },
         )
-        print("\n\n\n")
-        print("full messages: ", messages)
-        # print("experience.tokens: ", len(experience.tokens))
-        # print("experience.logprobs: ", len(experience.logprobs))
-        # print("experience.action_mask: ", len(experience.action_mask))
-        # print("experience.prompt_length: ", experience.prompt_length)
-        # print("experience.reward: ", experience.reward)
-        # print("experience.prompt_text: ", experience.prompt_text)
-        # print("experience.response_text: ", experience.response_text, "\n\n\n")
         return [experience]

     def _parse_model_response(self, response: str) -> tuple[str, str]:
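The lambda-to-def rewrites in the render hunk above are what resolve the pre-commit failure mentioned in the commit message: assigning a lambda to a name is flagged by common lint hooks (e.g. flake8's E731). A minimal sketch of the same pattern, with a hypothetical GRID_LOOKUP mapping:

GRID_LOOKUP = {0: "_", 1: "#", 2: "P", 3: "G"}  # hypothetical cell-to-glyph mapping

# Flagged form:
#   lookup = lambda cell: GRID_LOOKUP.get(cell, "?")

# Preferred form: a named function behaves identically and satisfies the linter.
def lookup(cell):
    return GRID_LOOKUP.get(cell, "?")

row = [0, 1, 2, 3]
print(" ".join(lookup(cell) for cell in row))  # prints: _ # P G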

trinity/common/workflows/step_wise_workflow.py

Lines changed: 0 additions & 4 deletions
@@ -3,9 +3,6 @@
 from trinity.common.experience import Experience
 from trinity.common.models.model import ModelWrapper
 from trinity.common.workflows.workflow import Task, Workflow
-from trinity.utils.log import get_logger
-
-logger = get_logger(__name__)  # TODO: delete this after debugging


 class StepWiseRewardWorkflow(Workflow):
@@ -149,7 +146,6 @@ def run(self) -> list[Experience]:
             experiences.extend(exps)
             if not continue_run:
                 break
-        logger.info(f"Experiences[0]: {experiences[0].response_text}")
         reward = self.reward(experiences)
         for exp in experiences:
             exp.reward = reward

trinity/common/workflows/workflow.py

Lines changed: 0 additions & 5 deletions
@@ -168,11 +168,6 @@ def set_repeat_times(self, repeat_times, run_id_base):
     def process_messages_to_experience(self, messages, reward, info={}) -> Experience:
         converted_experience = self.model.convert_messages_to_experience(messages)

-        if converted_experience.info.get("is_truncated", False):
-            print(f"!!!Debug: a truncation experience with reward {reward} is generated")
-            # TODO: handle this case
-            reward = 0
-
         tokens = converted_experience.tokens
         log_probs = converted_experience.logprobs
         assert converted_experience.action_mask is not None
