Commit 59a3863

feat: grpo async generate thread-safe queue production (#3821)
* lock
* remove lock
* fix
* fix
* move comment
* fix

---------

Co-authored-by: hjh <[email protected]>
1 parent 3b056b0 commit 59a3863
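The "thread-safe queue production" in the title refers to the async-generate worker publishing rollout results through queues (the train_queue and eval_queue that the diff below polls) rather than through unsynchronized shared state. A minimal sketch of that producer/consumer pattern, assuming Python's standard queue.Queue as the channel; rollout_worker and the string payloads are hypothetical stand-ins for the real generation call:

import queue
import threading
import time

# Thread-safe channel between the async rollout producer and the trainer (consumer).
train_queue = queue.Queue()

def rollout_worker(num_batches):
    # Hypothetical producer: generate completions and publish them on the queue.
    for step in range(num_batches):
        completions = [f'completion-{step}-{i}' for i in range(4)]  # stand-in for model generation
        train_queue.put(completions)  # queue.Queue.put is thread-safe

threading.Thread(target=rollout_worker, args=(3,), daemon=True).start()

# Consumer side: poll until the producer has published a batch, as the trainer does below.
while train_queue.empty():
    time.sleep(0.1)
print(train_queue.get())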

1 file changed (+17 −1 lines changed)
swift/trainers/rlhf_trainer/grpo_trainer.py

Lines changed: 17 additions & 1 deletion
@@ -321,6 +321,8 @@ def cyclic_iter(iterable):
                     yield x
 
         self.resample_iterator = cyclic_iter(self.get_resample_dataloader())
+        # flag indicating whether the evaluation has started
+        self.eval_flag = False
 
     def split_batches(self):
         """Sync weights in batches
@@ -1089,6 +1091,10 @@ def _get_per_token_logps(self, model, inputs):
         return selective_log_softmax(logits, input_ids)  # compute logprobs for the input tokens
 
     def evaluation_loop(self, dataloader, *args, **kwargs):
+        # Wait for the training rollout to complete
+        if self.args.async_generate:
+            while not self.is_async_generate_train_rollout_done():
+                time.sleep(0.1)
         # set mini_batch_size None in evaluation
         mini_batch_size = self.args.mini_batch_size
         self.args.mini_batch_size = None
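This guard, like the matching one added to training_step in the next hunk, uses a poll-and-sleep idiom: spin on a cheap predicate with a short sleep instead of blocking on the queue, so the check never consumes an item. A generic sketch of that idiom; wait_until is a hypothetical helper, not part of the trainer:

import time

def wait_until(predicate, interval=0.1, timeout=None):
    # Poll `predicate` every `interval` seconds; give up after `timeout` seconds if set.
    start = time.monotonic()
    while not predicate():
        if timeout is not None and time.monotonic() - start > timeout:
            return False
        time.sleep(interval)
    return True

# e.g. wait_until(lambda: not q.empty(), interval=0.1) mirrors the loops added here.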
@@ -1099,13 +1105,17 @@ def evaluation_loop(self, dataloader, *args, **kwargs):
         metrics = {f'{metric_key_prefix}_{key}': sum(val) / len(val) for key, val in self._metrics['eval'].items()}
         output.metrics.update(metrics)
         self.args.mini_batch_size = mini_batch_size
+        self.eval_flag = True
         return output
 
     def training_step(self,
                       model: nn.Module,
                       inputs: Dict[str, Union[torch.Tensor, Any]],
                       num_items_in_batch=None) -> torch.Tensor:
-
+        if self.args.async_generate:
+            # Wait for the eval rollout to complete
+            while not self.is_async_generate_eval_rollout_done():
+                time.sleep(0.1)
         if self.args.mini_batch_size is None:
             return super().training_step(model, inputs, num_items_in_batch)
         model.train()
@@ -1326,3 +1336,9 @@ def log(self, logs: dict[str, float], start_time: Optional[float] = None) -> None:
         if self.args.wandb_log_unique_prompts:
             df = df.drop_duplicates(subset=['prompt'])
         wandb.log({'completions': wandb.Table(dataframe=df)})
+
+    def is_async_generate_eval_rollout_done(self):
+        return not self.eval_flag or not self.eval_queue.empty()
+
+    def is_async_generate_train_rollout_done(self):
+        return not self.train_queue.empty()
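The eval-side predicate is deliberately vacuous until the first evaluation runs: while eval_flag is False, training_step never blocks; once evaluation_loop has set the flag, training proceeds only when the eval queue already holds a finished rollout. A toy check of that logic, assuming eval_queue is a standard queue.Queue as the attribute name suggests:

import queue

eval_flag = False
eval_queue = queue.Queue()

def is_async_generate_eval_rollout_done():
    # True before the first eval, or once an eval rollout has been queued.
    return not eval_flag or not eval_queue.empty()

assert is_async_generate_eval_rollout_done()        # no eval yet: training proceeds
eval_flag = True                                    # evaluation_loop has run once
assert not is_async_generate_eval_rollout_done()    # eval rollout pending: training waits
eval_queue.put('eval rollout results')
assert is_async_generate_eval_rollout_done()        # rollout queued: training proceeds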
