Update comments: remove bare `# NOTE` markers from the dpo_humanlike example configs

pan-x-c · pan-x-c · commit 07e1923e7c60 · 2025-05-22T15:09:12.000+08:00
diff --git a/examples/dpo_humanlike/dpo.yaml b/examples/dpo_humanlike/dpo.yaml
@@ -5,15 +5,15 @@ algorithm:
   algorithm_type: dpo
 checkpoint_root_dir: /PATH/TO/CHECKPOINT
 model:
-  model_path: '/PATH/TO/MODEL' # NOTE
+  model_path: '/PATH/TO/MODEL'
   max_prompt_tokens: 1792
   max_response_tokens: 256
 cluster:
   node_num: 1
   gpu_per_node: 8
 buffer:
   total_epochs: 20
-  batch_size: 32 # NOTE
+  batch_size: 32
   max_retry_times: 3
   max_retry_interval: 1
   trainer_input:
diff --git a/examples/dpo_humanlike/train_dpo.yaml b/examples/dpo_humanlike/train_dpo.yaml
@@ -32,7 +32,7 @@ actor_rollout_ref:
     grad_clip: 1.0
     clip_ratio: 0.2
     entropy_coeff: 0.001
-    use_kl_loss: True # NOTE
+    use_kl_loss: True
     kl_loss_coef: 0.1 # NOTE: beta for DPO
     kl_loss_type: low_var_kl # for grpo
     ppo_epochs: 1