Commit 65f5289

support resume training, still buggy

1 parent 5c5cb18 commit 65f5289
16 files changed: +368 -124 lines changed

applications/ColossalChat/coati/distributed/consumer.py

Lines changed: 23 additions & 2 deletions

```diff
@@ -55,6 +55,7 @@ def __init__(
         self.enable_profiling = enable_profiling
         assert batch_size % minibatch_size == 0, "batch_size should be divisible by minibatch_size"
         self.num_microbatches = batch_size // minibatch_size
+        self.checkpoint_path = model_config.pop("checkpoint_path", None)
 
         self.model_config = model_config
         self.plugin_config = plugin_config
@@ -143,6 +144,26 @@ def calculate_effective_group_to_raw_group_mapping(self, step):
         return effective_group_to_raw_group_mapping
 
     def loop(self) -> None:
+        self.profiler.enter("sync_model")
+        torch.cuda.empty_cache()
+        state_dict = self.state_dict()
+        if self.pp_size > 1:
+            if self.tp_rank == 0 and self.dp_rank == 0:
+                ray_broadcast_tensor_dict(
+                    state_dict,
+                    src=self.num_producers,
+                    device=self.device,
+                    group_name=f"sync_model_{self.pp_rank}",
+                )
+        else:
+            if self.rank == 0:
+                ray_broadcast_tensor_dict(
+                    state_dict, src=self.num_producers, device=self.device, group_name="sync_model"
+                )
+        del state_dict
+        torch.cuda.empty_cache()
+        self.profiler.exit("sync_model")
+
         print(
             f"Consumer{self.rank} num_update: {self.num_update_per_episode}, num_recv: {self.num_recv_per_update}, nmb: {self.num_microbatches}"
         )
@@ -286,7 +307,7 @@ def loop(self) -> None:
                 if self.rank == 0:
                     print(f"Start saving policy model at step {step + 1}.")
                 save_path = os.path.join(self.save_dir, f"modeling-episode-{episode}-step-{step + 1}")
-                self.booster.save_model(self.policy_model, save_path, shard=True)
+                self.booster.save_model(self.policy_model, save_path, shard=True, use_safetensors=True)
                 if self.rank == 0:
                     print(f"Saved model checkpoint at step {step + 1} in folder {save_path}")
 
@@ -365,7 +386,7 @@ def __init__(
         self.model = AutoModelForCausalLM.from_pretrained(path, **model_config)
         self.model.train()
         self.model.gradient_checkpointing_enable()
-        self.optimizer = HybridAdam(self.model.parameters(), lr=1e-3)
+        self.optimizer = HybridAdam(self.model.parameters(), lr=1e-3, weight_decay=0.01)
         self.accum_loss = torch.zeros(1, device=self.device)
 
     def setup(self):
```
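The new `checkpoint_path` key is popped out of `model_config` before that dict reaches `AutoModelForCausalLM.from_pretrained`, so resuming is configured entirely through the model config. A hedged sketch of the caller side (the other keys here are illustrative assumptions, not part of this commit):

```python
# Hypothetical launcher-side config for resuming training. "checkpoint_path" is
# consumed by the consumer's __init__ via model_config.pop(...), so it never
# reaches AutoModelForCausalLM.from_pretrained; the remaining keys do.
model_config = {
    "trust_remote_code": True,  # example kwarg that does reach from_pretrained
    "checkpoint_path": "./checkpoints/modeling-episode-0-step-100",  # example path in the save_model format above
}
```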

applications/ColossalChat/coati/distributed/grpo_consumer.py

Lines changed: 7 additions & 1 deletion

```diff
@@ -72,7 +72,11 @@ def __init__(
         self.policy_model = AutoModelForCausalLM.from_pretrained(path, **model_config)
         self.policy_model.train()
         self.policy_model.gradient_checkpointing_enable()
-        self.optimizer = HybridAdam(self.policy_model.parameters(), lr=grpo_config.get("lr", 1e-6))
+        self.optimizer = HybridAdam(
+            self.policy_model.parameters(),
+            lr=grpo_config.get("lr", 1e-6),
+            weight_decay=grpo_config.get("weight_decay", 0.01),
+        )
         self.accum_loss = torch.zeros(1, device=self.device)
         self.accum_kl = torch.zeros(1, device=self.device)
         self.accum_entropy = torch.zeros(1, device=self.device)
@@ -153,6 +157,8 @@ def setup(self):
         )
         if self.policy_loss_fn.beta > 0:
             self.reference_model, *_ = self.booster.boost(self.reference_model)
+        if self.checkpoint_path is not None:
+            self.booster.load_model(self.policy_model, self.checkpoint_path)
         self.plugin.logger.set_level("ERROR")
 
     def step(self, step_idx: int, pbar: Any, **kwargs) -> Optional[float]:
```
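Combined with the `checkpoint_path` pop in `consumer.py`, the resume path is: read the path out of `model_config`, boost the model, then load the checkpoint into the boosted model. A minimal sketch of that ordering, assuming a checkpoint written earlier by `booster.save_model(..., shard=True, use_safetensors=True)`; the surrounding `Booster`/plugin setup is elided:

```python
# Sketch of the resume ordering; simplified from __init__ plus setup() above.
checkpoint_path = model_config.pop("checkpoint_path", None)
policy_model = AutoModelForCausalLM.from_pretrained(path, **model_config)
optimizer = HybridAdam(policy_model.parameters(), lr=1e-6, weight_decay=0.01)
policy_model, optimizer, *_ = booster.boost(policy_model, optimizer)
if checkpoint_path is not None:
    booster.load_model(policy_model, checkpoint_path)  # handles sharded checkpoint directories
```

Note that only the model weights are restored here; optimizer state and the consumer's step counter are not reloaded.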

applications/ColossalChat/coati/distributed/launch_zero_bubble.py

Lines changed: 0 additions & 2 deletions

```diff
@@ -55,7 +55,6 @@ def launch_distributed(
     eval_dataset_config: Optional[Dict[str, Any]] = None,
     eval_interval: int = 100,
     eval_save_dir: Optional[str] = None,
-    eval_generation_config: Optional[Dict[str, Any]] = None,
     log_rollout_interval: int = 20,
     rollout_save_dir: str = "./rollout",
     enable_profiling: bool = False,
@@ -139,7 +138,6 @@ def launch_distributed(
        eval_interval=eval_interval,
        grpo_config=grpo_config,
        eval_save_dir=eval_save_dir,
-       eval_generation_config=eval_generation_config,
        project_name=project_name,
        run_name=run_name,
        wandb_group_name=wandb_group_name,
```

applications/ColossalChat/coati/distributed/producer.py

Lines changed: 22 additions & 0 deletions

```diff
@@ -203,6 +203,28 @@ def load_state_dict(self, state_dict: Dict[str, torch.Tensor]) -> None:
         raise NotImplementedError
 
     def loop(self) -> None:
+
+        torch.cuda.empty_cache()
+        self.profiler.enter("sync_model")
+        if self.consumer_pp_size > 1:
+            for pp_idx in range(self.consumer_pp_size):
+                state_dict = ray_broadcast_tensor_dict(
+                    None, self.num_producers, device=self.device, group_name=f"sync_model_{pp_idx}"
+                )
+                if "consumer_global_step" in state_dict:
+                    self.consumer_global_step = state_dict.pop("consumer_global_step").item()
+                self.load_state_dict(state_dict)
+        else:
+            state_dict = ray_broadcast_tensor_dict(
+                None, self.num_producers, device=self.device, group_name="sync_model"
+            )
+            if "consumer_global_step" in state_dict:
+                self.consumer_global_step = state_dict.pop("consumer_global_step").item()
+            self.load_state_dict(state_dict)
+        self.profiler.exit("sync_model")
+        del state_dict
+        torch.cuda.empty_cache()
+
         num_update_per_episode = len(self.train_dataloader) // self.num_microbatches
         num_valid_microbatches = num_update_per_episode * self.num_microbatches
 
```
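This block is the producer-side half of the consumer's new startup broadcast: before the first rollout, every producer receives the consumer's (possibly just-resumed) weights, so generation never starts from stale parameters. Schematically, with the same `ray_broadcast_tensor_dict` helper used in both diffs (a sketch of the pairing, not runnable on its own):

```python
# Consumer (global rank num_producers) publishes once before its train loop:
ray_broadcast_tensor_dict(state_dict, src=num_producers, device=device, group_name="sync_model")

# Each producer receives the same dict, pops the bookkeeping scalar, loads the rest:
state_dict = ray_broadcast_tensor_dict(None, num_producers, device=device, group_name="sync_model")
if "consumer_global_step" in state_dict:
    consumer_global_step = state_dict.pop("consumer_global_step").item()
load_state_dict(state_dict)
```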
applications/ColossalChat/coati/distributed/reward/reward_fn.py

Lines changed: 18 additions & 1 deletion

```diff
@@ -25,7 +25,12 @@
 from math_verify import ExprExtractionConfig, LatexExtractionConfig, parse, verify
 
 from .code_reward.utils import check_correctness_code_api as check_correctness_code
-from .reward_utils import extract_boxed_solution, extract_solution, validate_response_structure
+from .reward_utils import (
+    extract_boxed_solution,
+    extract_solution,
+    find_infinite_loop_start,
+    validate_response_structure,
+)
 
 CANNOT_PARSE_GT_ANSWER = -1
 CANNOT_PARSE_PREDICTION = -2
@@ -122,6 +127,8 @@ def math_reward_fn(input_ids, gt_answer, response_idx, **kwargs):
 
     decoded_final_answer = tokenizer.decode(input_ids[s : e + 1], skip_special_tokens=True)
 
+    repetition_reward = 1.0 if detect_repetition(decoded_final_answer) == [] else 0.0
+
     final_answer, processed_str = extract_solution(decoded_final_answer)
 
     format_valid = validate_response_structure(processed_str, kwargs["tags"])
@@ -137,6 +144,10 @@ def math_reward_fn(input_ids, gt_answer, response_idx, **kwargs):
     if format_valid:
         format_acc += 1
 
+    # Add repetition reward
+    if not eval_mode:
+        reward += repetition_reward
+
     # Check if the sequence is over length
     if not eval_mode and res_length >= max_new_tokens:
         reward *= 0.0
@@ -182,6 +193,8 @@ def boxed_math_reward_fn(input_ids, gt_answer, response_idx, **kwargs):
         raise ValueError("no gt_answer is provided, please check your training dataset.")
 
     decoded_final_answer = tokenizer.decode(input_ids[s : e + 1], skip_special_tokens=True)
+    print(f"Decoded final answer: {decoded_final_answer[-500:]}")
+    repetition_score = find_infinite_loop_start(input_ids[s : e + 1], min_repeats=2, distance=False)
 
     final_answer = extract_boxed_solution(decoded_final_answer)
     format_valid = final_answer is not None
@@ -202,6 +215,10 @@ def boxed_math_reward_fn(input_ids, gt_answer, response_idx, **kwargs):
     if format_valid:
         format_acc += 1
 
+    if not repetition_score > 0 and not eval_mode:
+        # award for non-repetition
+        reward += 2
+
     # Check if the sequence is over length
     if not eval_mode and res_length >= max_new_tokens:
         reward *= 0.0
```
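`find_infinite_loop_start` returns 0.0 when no trailing loop is detected, so `not repetition_score > 0` is just `repetition_score == 0`: loop-free responses earn a flat +2 during training, never during eval. A toy illustration of the branch (reward values invented):

```python
# Toy illustration of the non-repetition bonus; numbers are made up.
reward, eval_mode = 8.0, False
repetition_score = 0.0  # find_infinite_loop_start found no trailing loop
if not repetition_score > 0 and not eval_mode:
    reward += 2  # flat bonus for loop-free responses, training only
assert reward == 10.0
```

One caveat: `math_reward_fn` above calls `detect_repetition`, which is not among the names imported in this commit; if it is not defined elsewhere in the module, that path raises a `NameError`, plausibly part of the "still buggy" caveat in the commit message.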

applications/ColossalChat/coati/distributed/reward/reward_utils.py

Lines changed: 51 additions & 1 deletion

```diff
@@ -14,7 +14,9 @@
 # limitations under the License.
 
 import re
-from typing import Dict, Optional, Tuple
+from typing import Dict, List, Optional, Tuple
+
+import torch
 
 
 def validate_response_structure(processed_str: str, tags: Dict = None) -> bool:
@@ -122,3 +124,51 @@ def extract_boxed_solution(text: str) -> Optional[str]:
     except Exception:
         # Any other unexpected error
         return None
+
+
+import Levenshtein
+
+
+def is_similar(seq1: List[int], seq2: List[int], threshold: float = 0.9) -> bool:
+    ratio = Levenshtein.ratio(seq1, seq2)
+    return ratio >= threshold
+
+
+def find_infinite_loop_start(token_ids: List[int], min_repeats: int = 2, distance: bool = False) -> float:
+    n = len(token_ids)
+
+    # Step 1: Detect the repeating segment at the end using two pointers
+    longest_valid_length = 0
+    start_of_loop = n
+
+    for length in range(1, n // min_repeats + 1):  # Try different phrase lengths
+        count = 1  # Reset repetition counter
+        right = n - length  # Start comparing from the second last occurrence
+
+        while right - length >= 0:
+            # Check if the current phrase matches the previous phrase
+            if distance:
+                if is_similar(token_ids[right - length : right], token_ids[right : right + length]):
+                    count += 1
+                else:
+                    break  # Stop if repetition is broken
+            else:
+                # Use torch.equal() for tensor comparison
+                if torch.equal(token_ids[right - length : right], token_ids[right : right + length]):
+                    count += 1
+                else:
+                    break  # Stop if repetition is broken
+
+            right -= length  # Move left to check further
+
+        if count >= min_repeats:  # Found a valid repeating phrase
+            longest_valid_length = length
+            start_of_loop = right  # This is where the first cycle of the repetition begins
+
+    if longest_valid_length == 0:
+        return 0.0  # No infinite loop found, return repetition ratio as 0
+
+    # Step 2: Compute the repetition ratio
+    repetition_ratio = (n - start_of_loop) / n
+
+    return repetition_ratio
```
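Despite the `List[int]` annotation, the default `distance=False` path compares slices with `torch.equal`, so in practice the function expects a 1-D token tensor. A quick usage sketch with an invented sequence whose tail repeats the 3-token phrase `[7, 8, 9]` three times:

```python
import torch

# 9 of the 12 tokens belong to the repeated cycle [7, 8, 9], so the ratio is 9/12.
token_ids = torch.tensor([1, 2, 3, 7, 8, 9, 7, 8, 9, 7, 8, 9])
print(find_infinite_loop_start(token_ids, min_repeats=2))  # 0.75
```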
