bug fix in config_manager.py

chenyushuo · chenyushuo · commit 949dbfced463 · 2025-04-27T15:39:28.000+08:00
diff --git a/docs/sphinx_doc/source/tutorial/trinity_configs.md b/docs/sphinx_doc/source/tutorial/trinity_configs.md
@@ -375,7 +375,7 @@ trainer:
   save_freq: 100
   # auto: find the last ckpt to resume. If can't find, start from scratch
   resume_mode: auto # or auto or resume_path if
-  resume_from_path: False
+  resume_from_path: ""
   test_freq: 100
   critic_warmup: 0
   default_hdfs_dir: null
@@ -395,8 +395,9 @@ trainer:
 - `actor_rollout_ref.actor.grad_clip`: Gradient clip for actor model training.
 - `actor_rollout_ref.actor.clip_ratio`: Used for compute policy loss.
 - `actor_rollout_ref.actor.entropy_coeff`: Used for compute policy loss.
-- `actor_rollout_ref.actor.use_kl_loss`: True for GRPO.
-- `actor_rollout_ref.actor.kl_loss_coef`: Used for GRPO, optional value is `kl`, `abs`, `mse` or `low_var_kl`.
+- `actor_rollout_ref.actor.use_kl_loss`: Whether to enable kl loss.
+- `actor_rollout_ref.actor.kl_loss_coef`: The coefficient of kl loss.
+- `actor_rollout_ref.actor.kl_loss_type`: How to compute kl loss, optional value is `kl`, `abs`, `mse` or `low_var_kl`.
 - `actor_rollout_ref.actor.ulysses_sequence_parallel_size`: Ulysses sequence parallel size.
 - `actor_rollout_ref.actor.alg_type`: Used for OPMD, optional value is `ppo`, `opmd` or `pairwise_opmd`.
 - `actor_rollout_ref.actor.tau`: strength of regularization w.r.t. old / ref policy.
diff --git a/examples/dpo_humanlike/train_dpo.yaml b/examples/dpo_humanlike/train_dpo.yaml
@@ -173,7 +173,6 @@ trainer:
   save_freq: 30
   # auto: find the last ckpt to resume. If can't find, start from scratch
   resume_mode: auto # or auto or resume_path if
-  resume_from_path: False
   test_freq: 5
   critic_warmup: 0
   default_hdfs_dir: null
diff --git a/examples/grpo_alfworld/train_alfworld.yaml b/examples/grpo_alfworld/train_alfworld.yaml
@@ -172,7 +172,6 @@ trainer:
   save_freq: 1
   # auto: find the last ckpt to resume. If can't find, start from scratch
   resume_mode: auto # or auto or resume_path if
-  resume_from_path: False
   test_freq: 100
   critic_warmup: 0
   default_hdfs_dir: null
diff --git a/examples/grpo_gsm8k/gsm8k.yaml b/examples/grpo_gsm8k/gsm8k.yaml
@@ -35,7 +35,6 @@ buffer:
   train_dataset:
     name: gsm8k_buffer
     storage_type: queue
-    algorithm_type: ppo
     path: 'sqlite:///gsm8k.db'
   # sft_warmup_dataset: # Uncomment these to enable sft warmup
   #   name: warmup_data
diff --git a/examples/grpo_gsm8k/train_gsm8k.yaml b/examples/grpo_gsm8k/train_gsm8k.yaml
@@ -177,7 +177,6 @@ trainer:
   save_freq: 100
   # auto: find the last ckpt to resume. If can't find, start from scratch
   resume_mode: auto # or auto or resume_path if
-  resume_from_path: False
   test_freq: 5
   critic_warmup: 0
   default_hdfs_dir: null
diff --git a/examples/grpo_math/math.yaml b/examples/grpo_math/math.yaml
@@ -27,7 +27,6 @@ buffer:
   train_dataset:
     name: math_buffer
     storage_type: queue
-    algorithm_type: ppo
     path: 'sqlite:////math.db'
 explorer:
   engine_type: vllm_async
diff --git a/examples/grpo_math/train_math.yaml b/examples/grpo_math/train_math.yaml
@@ -169,7 +169,6 @@ trainer:
   save_freq: 100
   # auto: find the last ckpt to resume. If can't find, start from scratch
   resume_mode: auto # or auto or resume_path if
-  resume_from_path: False
   test_freq: 5
   critic_warmup: 0
   default_hdfs_dir: null
diff --git a/examples/grpo_sciworld/sciworld.yaml b/examples/grpo_sciworld/sciworld.yaml
@@ -21,7 +21,6 @@ buffer:
   train_dataset:
     name: sciworld_buffer
     storage_type: queue
-    algorithm_type: ppo
     path: 'sqlite:///sciworld.db'
 explorer:
   engine_type: vllm_async
diff --git a/examples/grpo_sciworld/train_sciworld.yaml b/examples/grpo_sciworld/train_sciworld.yaml
@@ -167,7 +167,6 @@ trainer:
   save_freq: 1
   # auto: find the last ckpt to resume. If can't find, start from scratch
   resume_mode: auto # or auto or resume_path if
-  resume_from_path: False
   test_freq: 100
   critic_warmup: 0
   default_hdfs_dir: null
diff --git a/examples/grpo_webshop/train_webshop.yaml b/examples/grpo_webshop/train_webshop.yaml
@@ -172,7 +172,6 @@ trainer:
   save_freq: 1
   # auto: find the last ckpt to resume. If can't find, start from scratch
   resume_mode: auto # or auto or resume_path if
-  resume_from_path: False
   test_freq: 100
   critic_warmup: 0
   default_hdfs_dir: null
diff --git a/examples/opmd_gsm8k/train_opmd_gsm8k.yaml b/examples/opmd_gsm8k/train_opmd_gsm8k.yaml
@@ -204,7 +204,6 @@ trainer:
   save_freq: 100
   # auto: find the last ckpt to resume. If can't find, start from scratch
   resume_mode: auto # or auto or resume_path if
-  resume_from_path: False
   test_freq: 100
   critic_warmup: 0
   default_hdfs_dir: null
diff --git a/examples/ppo_countdown/train_countdown.yaml b/examples/ppo_countdown/train_countdown.yaml
@@ -179,7 +179,6 @@ trainer:
   save_freq: 100
   # auto: find the last ckpt to resume. If can't find, start from scratch
   resume_mode: auto # or auto or resume_path if
-  resume_from_path: False
   test_freq: 100
   critic_warmup: 0
   default_hdfs_dir: null
diff --git a/trinity/common/verl_config.py b/trinity/common/verl_config.py
@@ -83,13 +83,13 @@ class Actor:
     ppo_epochs: int = 1
     shuffle: bool = False
     ulysses_sequence_parallel_size: int = 1
+    checkpoint: Checkpoint = field(default_factory=Checkpoint)
     optim: Optim = field(default_factory=Optim)
     fsdp_config: FSDPConfig = field(default_factory=FSDPConfig)
     alg_type: str = "ppo"  # ppo / opmd / pairwise_opmd
     tau: float = 0.001  # strength of regularization w.r.t. old / ref policy
     opmd_baseline: str = "mean"  # mean / logavgexp, applicable to opmd
     use_uid: bool = False  # True / False, applicable to pairwise_opmd
-    checkpoint: Checkpoint = field(default_factory=Checkpoint)
 
 
 @dataclass
@@ -205,13 +205,17 @@ class CustomRewardFunction:
 class KL_Ctrl:
     type: str = "fixed"
     kl_coef: float = 0.001
+    horizon: float = 10000
+    target_kl: float = 0.1
 
 
 @dataclass
 class Algorithm:
     gamma: float = 1.0
     lam: float = 1.0
     adv_estimator: str = "gae"
+    norm_adv_by_std_in_grpo: bool = True
+    use_kl_in_reward: bool = False
     kl_penalty: str = "kl"
     kl_ctrl: KL_Ctrl = field(default_factory=KL_Ctrl)
 
@@ -300,7 +304,9 @@ def synchronize_config(self, config: Config) -> None:
         self.actor_rollout_ref.rollout.temperature = config.explorer.temperature
         self.actor_rollout_ref.rollout.n = config.explorer.repeat_times
         batch_size_per_gpu = self.buffer.read_batch_size // world_size
-        self.actor_rollout_ref.actor.alg_type = config.trainer.algorithm_type.value
+        self.actor_rollout_ref.actor.alg_type = (
+            config.trainer.algorithm_type.value
+        )  # TODO: refactor `alg_type`
         print(f"using algorithm type: {self.actor_rollout_ref.actor.alg_type}")
 
         if self.actor_rollout_ref.actor.alg_type == "dpo":  # for DPO
diff --git a/trinity/manager/config_manager.py b/trinity/manager/config_manager.py
@@ -82,7 +82,7 @@ def _init_default_config(self):
             "top_p": 1.0,
             "top_k": -1,
             "seed": 42,
-            "logprobs": None,
+            "logprobs": 0,
             "enable_prefix_caching": False,
             "enforce_eager": True,
             # Trainer Configs
@@ -128,7 +128,7 @@ def _init_default_config(self):
             "actor_grad_clip": 1.0,
             "actor_clip_ratio": 0.2,
             "actor_entropy_coeff": 0.001,
-            "actor_use_kl_loss": False,
+            "actor_use_kl_loss": True,
             "actor_kl_loss_coef": 0.001,
             "actor_kl_loss_type": "low_var_kl",
             "actor_checkpoint": ["model", "hf_model", "optimizer", "extra"],
@@ -161,9 +161,6 @@ def _set_model_path(self):
         if not st.session_state["model_path"].strip():
             self.unfinished_fields.add("model_path")
             st.warning("Please input model path.")
-        elif not os.path.isabs(st.session_state["model_path"].strip()):
-            self.unfinished_fields.add("model_path")
-            st.warning("Please input an absolute path.")
 
     def _set_critic_model_path(self):
         st.text_input(
@@ -308,9 +305,6 @@ def _check_sft_warmup_dataset_path(self):
                 st.warning(
                     "Please input SFT warmup dataset path when `sft_warmup_iteration` is not 0"
                 )
-            elif not os.path.isabs(st.session_state["sft_warmup_dataset_path"].strip()):
-                self.unfinished_fields.add("sft_warmup_dataset_path")
-                st.warning("Please input an absolute path.")
 
     def _set_sft_warmup_dataset_path(self):
         st.text_input("SFT Warmup Dataset Path", key="sft_warmup_dataset_path")
@@ -397,14 +391,31 @@ def _check_engine_num_and_tp_size(self):
                 )
 
     def _set_repeat_times(self):
-        st.number_input("Repeat Times", key="repeat_times", min_value=1)
+        if st.session_state["algorithm_type"] == AlgorithmType.OPMD.value or st.session_state[
+            "adv_estimator"
+        ] in [
+            AdvantageEstimator.GRPO.value,
+            AdvantageEstimator.RLOO.value,
+        ]:
+            min_repeat_times = 2
+        else:
+            min_repeat_times = 1
+        if st.session_state["repeat_times"] < min_repeat_times:
+            st.session_state["repeat_times"] = min_repeat_times
+        st.number_input(
+            "Repeat Times",
+            key="repeat_times",
+            min_value=min_repeat_times,
+            help="`repeat_times` is used to set how many experiences each task can generate, "
+            "and it must be greater than `1` when `algorithm_type` is `opmd` or `grpo`.",
+        )
 
     def _set_sync_method(self):
         st.selectbox(
             "Sync Method",
             ["online", "offline"],
             key="sync_method",
-            help="""`online`: the explorer and trainer switch at sync_iter_interval.
+            help="""`online`: the explorer and trainer sync model weights once every sync_iter_interval steps.
 
 `offline`: the trainer saves the model checkpoint, and the explorer loads it at sync_iter_interval.""",
         )
@@ -633,10 +644,7 @@ def _set_actor_entropy_coeff(self):
         )
 
     def _set_actor_use_kl_loss(self):
-        st.session_state["actor_use_kl_loss"] = (
-            st.session_state["algorithm_type"] == "ppo"
-            and st.session_state["adv_estimator"] == "grpo"
-        )  # TODO: check it
+        st.checkbox("Use KL Loss", key="actor_use_kl_loss")
 
     def _set_actor_kl_loss_coef(self):
         st.number_input(
@@ -645,15 +653,13 @@ def _set_actor_kl_loss_coef(self):
             min_value=0.0,
             max_value=1.0,
             format="%.1e",
-            help="Used in advantage calcuation for GRPO",
         )
 
     def _set_actor_kl_loss_type(self):
         st.selectbox(
             "KL Loss Type",
             ["kl", "abs", "mse", "low_var_kl"],
             key="actor_kl_loss_type",
-            help="Used in advantage calcuation for GRPO",
         )
 
     def _set_actor_tau(self):
@@ -793,7 +799,9 @@ def beginner_mode(self):
         if st.session_state["sft_warmup_iteration"] > 0:
             self._set_sft_warmup_dataset_args()
 
-        self._set_configs_with_st_columns(["default_workflow_type", "default_reward_fn_type"])
+        self._set_configs_with_st_columns(
+            ["default_workflow_type", "default_reward_fn_type", "repeat_times"]
+        )
 
         self._set_configs_with_st_columns(["sync_iteration_interval", "eval_interval", "save_freq"])
 
@@ -1102,8 +1110,8 @@ def _generate_verl_config(self, trainer_nnodes: int = 1, trainer_n_gpus_per_node
                 "strategy": st.session_state["training_strategy"],
                 "optim": {
                     "lr": st.session_state["critic_lr"],
-                    "lr_warmup_steps_ratio": st.session_state["critic_warmup_style"],
-                    "warmup_style": st.session_state["critic_lr_warmup_steps_ratio"],
+                    "lr_warmup_steps_ratio": st.session_state["critic_lr_warmup_steps_ratio"],
+                    "warmup_style": st.session_state["critic_warmup_style"],
                     "total_training_steps": -1
                     if st.session_state["total_training_steps"] is None
                     else st.session_state["total_training_steps"],
@@ -1228,7 +1236,7 @@ def generate_config(self):
         ):
             config = {
                 "data": {
-                    "total_epochs": st.session_state["total_epoch"],
+                    "total_epoch": st.session_state["total_epoch"],
                     "batch_size": st.session_state["task_num_per_batch"],
                     "dataset_path": st.session_state["dataset_path"],
                     "default_workflow_type": st.session_state["default_workflow_type"],