feat: rlhf generation samples log to swanlab (#4907)

Zeyi-Lin · web-flow · commit 1e8727f6b968 · 2025-07-11T13:56:11.000+08:00
* visualize samples add swanlab

* fix

* fix lint

* patch_profiling_context

* patch_profiling_decorator
diff --git a/swift/trainers/rlhf_trainer/grpo_trainer.py b/swift/trainers/rlhf_trainer/grpo_trainer.py
@@ -27,7 +27,6 @@
 from transformers import PreTrainedModel, TrainerCallback
 from transformers.trainer import Trainer
 from trl import GRPOTrainer as HFGRPOTrainer
-from trl.extras.profiling import profiling_context, profiling_decorator
 from trl.models import prepare_deepspeed
 from trl.trainer.callbacks import SyncRefModelCallback
 from trl.trainer.grpo_trainer import nanmax, nanmin, nanstd
@@ -39,11 +38,12 @@
 from swift.llm.template.template_inputs import StdTemplateInputs
 from swift.plugin import loss_scale_map, multi_turns, orms, rm_plugins
 from swift.plugin.multi_turn import MultiTurnScheduler
-from swift.utils import (JsonlWriter, empty_cache, get_current_device, get_device, get_logger, is_vllm_available,
-                         is_wandb_available, seed_worker, unwrap_model_for_generation)
+from swift.utils import (JsonlWriter, empty_cache, get_current_device, get_device, get_logger, is_swanlab_available,
+                         is_vllm_available, is_wandb_available, seed_worker, unwrap_model_for_generation)
 from ..mixin import SwiftMixin
 from .rlhf_mixin import RLHFTrainerMixin
-from .utils import _ForwardRedirection, patch_lora_merge, patch_lora_unmerge
+from .utils import (_ForwardRedirection, patch_lora_merge, patch_lora_unmerge, patch_profiling_context,
+                    patch_profiling_decorator)
 from .vllm_client import VLLMClient
 
 del HFGRPOTrainer.__init__
@@ -52,6 +52,8 @@
 logger = get_logger()
 if is_wandb_available():
     import wandb
+if is_swanlab_available():
+    import swanlab
 
 InputsType = List[Dict[str, Union[torch.Tensor, Any]]]
 # tuple: (messages, finish_reason)
@@ -325,7 +327,7 @@ def cyclic_iter(iterable):
         # flag indicating whether the evaluation has started
         self.eval_flag = False
 
-    @profiling_decorator
+    @patch_profiling_decorator
     def _prepare_inputs(self, generation_batch: dict[str, Union[torch.Tensor,
                                                                 Any]]) -> dict[str, Union[torch.Tensor, Any]]:
         # Prepares inputs for model training/evaluation by managing completion generation and batch handling.
@@ -479,7 +481,7 @@ def _template_context(self, template: Template):
             template.set_mode(mode)
             template.max_length = max_length
 
-    @profiling_decorator
+    @patch_profiling_decorator
     def _move_model_to_vllm(self, skip_async_check=False):
         deepspeed_plugin = self.accelerator.state.deepspeed_plugin
         zero_stage_3 = deepspeed_plugin is not None and deepspeed_plugin.zero_stage == 3
@@ -906,7 +908,7 @@ def _score_completions(self, inputs: InputsType) -> Tuple[torch.Tensor, torch.Te
 
         for i, (reward_func, reward_model_plugin, reward_func_name) in enumerate(
                 zip(self.reward_funcs, self.reward_model_plugins, self.reward_func_names)):
-            with profiling_context(self, reward_func_name):
+            with patch_profiling_context(self, reward_func_name):
                 # reward model
                 if isinstance(reward_func, nn.Module):
                     output_reward_func = reward_model_plugin(inputs=inputs)
@@ -1110,7 +1112,7 @@ def _apply_chat_template_to_messages_list(self, messages_list: InputsType):
             prompts_text.append(''.join(processed_context))
         return prompts_text
 
-    @profiling_decorator
+    @patch_profiling_decorator
     def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
         # Compute the per-token log probabilities for the model, return_outputs=True in mini-batch training
         if isinstance(inputs, list):
@@ -1275,7 +1277,7 @@ def _padding_free_output_hook(module, args, kwargs, result):
             remove_handle2.remove()
 
     # Get the per-token log probabilities for the completions for the model and the reference model
-    @profiling_decorator
+    @patch_profiling_decorator
     def _get_per_token_logps(self, model, inputs):
         from trl.trainer.utils import selective_log_softmax
         logits_to_keep = inputs['logits_to_keep']
@@ -1305,7 +1307,7 @@ def _get_per_token_logps(self, model, inputs):
         input_ids = input_ids[:, -logits_to_keep:]
         return selective_log_softmax(logits, input_ids)  # compute logprobs for the input tokens
 
-    @profiling_decorator
+    @patch_profiling_decorator
     def _get_last_hidden_state(self, unwrapped_model, inputs, logits_to_keep):
         # unwrap the model to access the model.model
         if is_peft_model(unwrapped_model):
@@ -1399,7 +1401,7 @@ def _engine_infer(
         *,
         use_tqdm: Optional[bool] = False,
     ) -> List[ChatCompletionResponse]:
-        with profiling_context(self, 'generate'):
+        with patch_profiling_context(self, 'generate'):
             if self.vllm_mode == 'server':
                 request_keys = ['messages', 'images', 'audios', 'videos', 'tools', 'objects']
 
@@ -1586,6 +1588,16 @@ def log(self, logs: dict[str, float], start_time: Optional[float] = None) -> Non
                     df = df.drop_duplicates(subset=['prompt'])
                 wandb.log({'completions': wandb.Table(dataframe=df)})
 
+            if self.args.report_to and 'swanlab' in self.args.report_to and swanlab.get_run() is not None:
+                headers = list(table.keys())
+                rows = []
+                for i in range(len(table['step'])):
+                    row = []
+                    for header in headers:
+                        row.append(table[header][i])
+                    rows.append(row)
+                swanlab.log({'completions': swanlab.echarts.Table().add(headers, rows)})
+
     def is_async_generate_eval_rollout_done(self):
         return not self.eval_flag or not self.eval_queue.empty()
 
diff --git a/swift/trainers/rlhf_trainer/reward_trainer.py b/swift/trainers/rlhf_trainer/reward_trainer.py
@@ -77,3 +77,10 @@ def visualize_samples(self, num_print_samples: int):
 
                 if wandb.run is not None:
                     wandb.log({'completions': wandb.Table(dataframe=df)})
+
+            if 'swanlab' in self.args.report_to:
+                import swanlab
+                if swanlab.get_run() is not None:
+                    swanlab_table = swanlab.echarts.Table()
+                    swanlab_table.add(headers=df.columns.tolist(), rows=df.values.tolist())
+                    swanlab.log({'completions': swanlab_table})
diff --git a/swift/trainers/rlhf_trainer/utils.py b/swift/trainers/rlhf_trainer/utils.py
@@ -1,4 +1,6 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+import functools
+import time
 from contextlib import contextmanager
 from types import MethodType
 from typing import Any, Optional
@@ -8,6 +10,13 @@
 from peft.tuners.lora import LoraLayer
 from torch import nn
 
+from swift.utils import is_swanlab_available, is_wandb_available
+
+if is_wandb_available():
+    import wandb
+if is_swanlab_available():
+    import swanlab
+
 
 def round_robin(num_reqs, num_workers):
     """Distribute requests evenly across workers using round-robin algorithm.
@@ -125,6 +134,32 @@ def unmerge_patched(self):
                 del module.unmerge_origin
 
 
+@contextmanager
+def patch_profiling_context(trainer, name: str):  # times the `with` body and reports it to wandb / swanlab
+    start_time = time.perf_counter()
+    yield  # the caller's `with` body runs here; an exception raised in it skips the logging below
+    end_time = time.perf_counter()
+    duration = end_time - start_time  # elapsed seconds between entering and leaving the `with` body
+
+    profiling_metrics = {f'profiling/Time taken: {trainer.__class__.__name__}.{name}': duration}
+
+    if 'wandb' in trainer.args.report_to and wandb.run is not None and trainer.accelerator.is_main_process:
+        wandb.log(profiling_metrics)
+
+    if 'swanlab' in trainer.args.report_to and swanlab.get_run() is not None and trainer.accelerator.is_main_process:
+        swanlab.log(profiling_metrics)
+
+
+def patch_profiling_decorator(func):
+    """Time every call of a trainer method with `patch_profiling_context`, labelled by the method name."""
+    @functools.wraps(func)
+    def wrapper(self, *args, **kwargs):
+        with patch_profiling_context(self, func.__name__):
+            return func(self, *args, **kwargs)  # `func` is the plain function, so `self` must be forwarded
+
+    return wrapper
+
+
 class _ForwardRedirection:
     """Implements the `forward-redirection`.
     Taken from Pytorch-lightning: