
Commit b9e6055

Merge pull request #6208 from hpcaitech/grpo_dev
[Chat] fix colossalchat bugs
2 parents 9379cbd + 7595c45 commit b9e6055

File tree

8 files changed: +10, -10 lines changed


applications/ColossalChat/coati/experience_maker/naive.py

Lines changed: 1 addition & 1 deletion

@@ -140,7 +140,7 @@ def make_experience(
         num_actions = 0

         for inference_mini_batch_id in range(0, input_ids.size(0), self.inference_batch_size):
-            s, e = inference_mini_batch_id, (inference_mini_batch_id + 1) * self.inference_batch_size
+            s, e = inference_mini_batch_id, inference_mini_batch_id + self.inference_batch_size
             if input_ids[s:e].size(0) == 0:
                 break
             sequences = generate(self.actor, input_ids[s:e], self.tokenizer, **generate_kwargs)
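
This is the substantive bug: the loop advances in strides of `self.inference_batch_size`, so each slice should end at `start + batch_size`. The old bound `(start + 1) * batch_size` grows faster than the stride, so mini-batches after the first exceed the intended size and overlapping rows are processed twice. A standalone sketch with illustrative values (not taken from the repo):

# Sketch of the mini-batch slicing fix in naive.py (illustrative values only).
total, batch_size = 10, 4
rows = list(range(total))

# Old bound: (start + 1) * batch_size grows faster than the loop stride, so
# later slices are oversized and overlap the previous ones.
old = [rows[s : (s + 1) * batch_size] for s in range(0, total, batch_size)]
# old == [[0, 1, 2, 3], [4, 5, 6, 7, 8, 9], [8, 9]]

# New bound: start + batch_size caps every slice at batch_size rows and
# visits each row exactly once.
new = [rows[s : s + batch_size] for s in range(0, total, batch_size)]
# new == [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]
assert [r for chunk in new for r in chunk] == rows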

applications/ColossalChat/coati/trainer/dpo.py

Lines changed: 1 addition & 1 deletion

@@ -380,8 +380,8 @@ def _criterion(outputs, inputs):
                         self.accumulative_meter.get("accuracy"),
                         global_step,
                     )
-                self.num_train_step += 1
                 self.accumulative_meter.reset()
+                self.num_train_step += 1

                 if self.save_dir is not None and self.num_train_step > 0 and self.num_train_step % self.save_interval == 0:
                     # save checkpoint
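
In the DPO trainer the visible change is ordering: `self.num_train_step += 1` now comes after `self.accumulative_meter.reset()`, grouping the counter bump with the rest of the end-of-step bookkeeping that runs before the `save_interval` checkpoint check a few lines below. A runnable schematic of that post-change order; every name here is an illustrative stand-in, not the trainer's real API:

# Schematic of the end-of-step bookkeeping order after this change.
# log_metrics / save_checkpoint are hypothetical stand-ins; save_interval is illustrative.
def log_metrics(step: int) -> None:
    pass  # placeholder for the writer.add_scalar(...) calls

def save_checkpoint(step: int) -> None:
    print(f"checkpoint at step {step}")

num_train_step = 0
save_interval = 100

for _ in range(300):                 # stand-in for the optimizer steps
    # ... forward / backward / optimizer.step() would happen here ...
    log_metrics(num_train_step)      # 1. flush the accumulated metrics
    # accumulative_meter.reset()     # 2. clear the running averages
    num_train_step += 1              # 3. count the step as completed
    if num_train_step % save_interval == 0:
        save_checkpoint(num_train_step)  # 4. checkpoint on completed-step boundaries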

applications/ColossalChat/coati/trainer/grpo.py

Lines changed: 2 additions & 2 deletions

@@ -231,7 +231,6 @@ def _training_step(self, experience: Experience):
         experience:
             sequences: [batch_size, prompt_length + response_length] --- <PAD>...<PAD><PROMPT>...<PROMPT><RESPONSE>...<RESPONSE><PAD>...<PAD>
         """
-        self.num_train_step += 1
         self.actor.train()
         num_actions = experience.action_log_probs.size(1)
         # policy loss
@@ -294,7 +293,7 @@ def _training_step(self, experience: Experience):
             self.temperature_annealing_scheduler.step_forward()

         # preparing logging model output and corresponding rewards.
-        if self.num_train_step % 10 == 1:
+        if self.num_train_step % 10 == 0:
             response_text = self.experience_maker.tokenizer.batch_decode(
                 experience.sequences, skip_special_tokens=True
             )
@@ -327,6 +326,7 @@ def _training_step(self, experience: Experience):
             self.writer.add_scalar("approx_kl", self.accumulative_meter.get("kl"), global_step)
             self.writer.add_scalar("advantages", self.accumulative_meter.get("advantages"), global_step)
         self.accumulative_meter.reset()
+        self.num_train_step += 1

     def _learn(self, update_step: int):
         """

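The two GRPO edits are coupled. The counter used to be bumped at the top of `_training_step`, so during the step it already held the 1-based step index and `num_train_step % 10 == 1` logged model output on steps 1, 11, 21, and so on. With the bump moved to the very end, the counter holds the 0-based index during the step, and `% 10 == 0` selects the same steps. A small standalone check of that equivalence (not trainer code):

# The logging cadence is unchanged: pre-increment + "% 10 == 1" and
# post-increment + "% 10 == 0" select the same training steps.
def logged_steps_old(n_steps: int) -> list[int]:
    logged, counter = [], 0
    for step in range(n_steps):
        counter += 1              # old: increment at the top of the step
        if counter % 10 == 1:
            logged.append(step)
    return logged

def logged_steps_new(n_steps: int) -> list[int]:
    logged, counter = [], 0
    for step in range(n_steps):
        if counter % 10 == 0:     # new: counter still holds the 0-based index here
            logged.append(step)
        counter += 1              # new: increment at the end of the step
    return logged

assert logged_steps_old(50) == logged_steps_new(50) == [0, 10, 20, 30, 40]
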
applications/ColossalChat/coati/trainer/kto.py

Lines changed: 1 addition & 1 deletion

@@ -256,7 +256,7 @@ def _train(self, epoch: int):
                    self.coordinator.print_on_master(
                        f"Saved checkpoint at epoch {epoch} step {self.save_interval} at folder {self.save_dir}"
                    )
-                   self.num_train_step += 1
+                   self.num_train_step += 1

        step_bar.close()

applications/ColossalChat/coati/trainer/orpo.py

Lines changed: 1 addition & 1 deletion

@@ -233,7 +233,7 @@ def _train(self, epoch: int):
                    self.coordinator.print_on_master(
                        f"Saved checkpoint at epoch {epoch} step {self.save_interval} at folder {self.save_dir}"
                    )
-                   self.num_train_step += 1
+                   self.num_train_step += 1

        step_bar.close()

applications/ColossalChat/coati/trainer/ppo.py

Lines changed: 2 additions & 2 deletions

@@ -220,7 +220,6 @@ def _training_step(self, experience: Experience):
         experience:
             sequences: [batch_size, prompt_length + response_length] --- <PAD>...<PAD><PROMPT>...<PROMPT><RESPONSE>...<RESPONSE><PAD>...<PAD>
         """
-        self.num_train_step += 1
         self.actor.train()
         self.critic.train()
         num_actions = experience.action_log_probs.size(1)
@@ -294,7 +293,7 @@ def _training_step(self, experience: Experience):
             self.critic_scheduler.step()

         # preparing logging model output and corresponding rewards.
-        if self.num_train_step % 10 == 1:
+        if self.num_train_step % 10 == 0:
             response_text = self.experience_maker.tokenizer.batch_decode(
                 experience.sequences, skip_special_tokens=True
             )
@@ -336,6 +335,7 @@ def _training_step(self, experience: Experience):
             self.writer.add_scalar("value", self.accumulative_meter.get("value"), self.num_train_step)
             self.writer.add_scalar("advantages", self.accumulative_meter.get("advantages"), self.num_train_step)
         self.accumulative_meter.reset()
+        self.num_train_step += 1

     def _learn(self, update_step: int):
         """

applications/ColossalChat/coati/trainer/rm.py

Lines changed: 1 addition & 1 deletion

@@ -193,7 +193,7 @@ def _train(self, epoch):
                    self.coordinator.print_on_master(
                        f"Saved checkpoint at epoch {epoch} step {(i + 1)/self.accumulation_steps} at folder {self.save_dir}"
                    )
-                   self.num_train_step += 1
+                   self.num_train_step += 1
        step_bar.close()

    def _eval(self, epoch):

applications/ColossalChat/coati/trainer/sft.py

Lines changed: 1 addition & 1 deletion

@@ -152,9 +152,9 @@ def _train(self, epoch: int):
                if self.writer:
                    self.writer.add_scalar("train/loss", self.accumulative_meter.get("loss"), global_step)
                    self.writer.add_scalar("train/lr", self.scheduler.get_last_lr()[0], global_step)
-                   self.num_train_step += 1
                self.accumulative_meter.reset()
                step_bar.update()
+               self.num_train_step += 1

            # Save checkpoint
            if (
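
The SFT change follows the same pattern, and here the placement matters most: judging by its position right after the two `add_scalar` calls, the removed increment appears to have sat inside the `if self.writer:` branch, which only executes on processes that own a TensorBoard writer. Moving it after `step_bar.update()` lets the counter advance on every optimizer step whether or not logging is active, so the save-checkpoint condition below behaves the same on all ranks. A toy illustration of the failure mode (all names hypothetical):

# Why tying the step counter to the logging guard is a bug: on a process
# without a writer the counter never advances, so interval-based
# checkpointing never fires. All names here are hypothetical.
def completed_steps(n_steps: int, has_writer: bool, increment_inside_guard: bool) -> int:
    counter = 0
    for _ in range(n_steps):
        if has_writer:
            # writer.add_scalar(...) would go here
            if increment_inside_guard:
                counter += 1      # old placement: only counted when logging runs
        if not increment_inside_guard:
            counter += 1          # new placement: counted on every step
    return counter

assert completed_steps(100, has_writer=False, increment_inside_guard=True) == 0
assert completed_steps(100, has_writer=False, increment_inside_guard=False) == 100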
