From f49778042138b0a334c6a9c07b84e2edc181c340 Mon Sep 17 00:00:00 2001
From: chenyushuo <297086016@qq.com>
Date: Fri, 23 May 2025 10:35:29 +0800
Subject: [PATCH 1/6] fix config manager
---
trinity/common/config.py | 3 +-
trinity/common/workflows/workflow.py | 16 +-
trinity/manager/config_manager.py | 377 ++++++++++++++++-----------
3 files changed, 234 insertions(+), 162 deletions(-)
diff --git a/trinity/common/config.py b/trinity/common/config.py
index b2703d4d2d..075d6e951f 100644
--- a/trinity/common/config.py
+++ b/trinity/common/config.py
@@ -259,7 +259,6 @@ class ExplorerConfig:
@dataclass
class TrainerConfig:
trainer_type: str = "verl"
- trainer_config_path: str = ""
save_interval: int = 0
enable_preview: bool = True # enable rollout preview in wandb
@@ -271,7 +270,9 @@ class TrainerConfig:
actor_clip_ratio: float = 0.2
# TODO: extract more train-related params from underlying trainer engine
+    # Only one of `trainer_config` and `trainer_config_path` needs to be set
trainer_config: Any = field(default_factory=dict)
+ trainer_config_path: str = ""
@dataclass
diff --git a/trinity/common/workflows/workflow.py b/trinity/common/workflows/workflow.py
index 1a0daadb2b..9786bd6b77 100644
--- a/trinity/common/workflows/workflow.py
+++ b/trinity/common/workflows/workflow.py
@@ -153,12 +153,12 @@ def __init__(
task: Task,
auxiliary_models: Optional[List[openai.OpenAI]] = None,
):
+ self.reset(task)
super().__init__(
model=model,
task=task,
auxiliary_models=auxiliary_models,
)
- self.reset(task)
@property
def resettable(self):
@@ -226,14 +226,12 @@ def __init__(
task: Task,
auxiliary_models: Optional[List[openai.OpenAI]] = None,
):
- if task.reward_fn is None:
- task.reward_fn = MathRewardFn
- if task.reward_fn == MathRewardFn and task.format_args.system_prompt is None:
-            task.format_args.system_prompt = """A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e.,
-<think> reasoning process here </think>
-<answer> answer here </answer>.
-"""
- super().__init__(model=model, task=task, auxiliary_models=auxiliary_models)
+ self.reset(task)
+ super().__init__(
+ model=model,
+ task=task,
+ auxiliary_models=auxiliary_models,
+ )
def reset(self, task: Task):
if task.reward_fn is None:
diff --git a/trinity/manager/config_manager.py b/trinity/manager/config_manager.py
index 21b0e57348..c8a168d1dc 100644
--- a/trinity/manager/config_manager.py
+++ b/trinity/manager/config_manager.py
@@ -54,7 +54,7 @@ def _init_default_config(self):
# Model Configs
"model_path": "",
"critic_model_path": "",
- "checkpoint_path": "",
+ "checkpoint_root_dir": "",
"node_num": 1,
"gpu_per_node": 8,
"total_gpu_num": 8,
@@ -109,6 +109,7 @@ def _init_default_config(self):
"_not_grouped_adv_repeat_times": 1,
"repeat_times": 1,
"tensor_parallel_size": 1,
+ "use_v1": True,
"enable_prefix_caching": False,
"enforce_eager": True,
"dtype": "bfloat16",
@@ -119,8 +120,11 @@ def _init_default_config(self):
"logprobs": 0,
"gpu_memory_utilization": 0.9,
"enable_chunked_prefill": False,
+ "enable_thinking": False,
+ "enable_openai_api": False,
"max_timeout": 900,
"explorer_max_retry_times": 2,
+ "eval_on_latest_checkpoint": True,
# Synchronizer Configs
"_not_dpo_sync_method": SyncMethod.NCCL.value,
"sync_method": SyncMethod.NCCL.value,
@@ -204,7 +208,7 @@ def maintain_session_state(self):
def _set_project(self):
st.text_input("Project", key="project")
- def _set_name(self):
+ def _set_exp_name(self):
st.text_input("Experiment Name", key="exp_name")
def _set_monitor_type(self):
@@ -221,18 +225,19 @@ def _set_model_path(self):
st.warning("Please input model path.")
def _set_critic_model_path(self):
- st.text_input(
- "Critic Model Path (defaults to `model_path`)",
- key="critic_model_path",
- )
+ if st.session_state["adv_estimator"] == AdvantageEstimator.GAE.value:
+ st.text_input(
+ "Critic Model Path (defaults to `model_path`)",
+ key="critic_model_path",
+ )
- def _set_checkpoint_path(self):
- st.text_input("Checkpoint Path", key="checkpoint_path")
- if not st.session_state["checkpoint_path"].strip(): # TODO: may auto generate
- self.unfinished_fields.add("checkpoint_path")
- st.warning("Please input checkpoint path.")
- elif not os.path.isabs(st.session_state["checkpoint_path"].strip()):
- self.unfinished_fields.add("checkpoint_path")
+ def _set_checkpoint_root_dir(self):
+ st.text_input("Checkpoint Root Dir", key="checkpoint_root_dir")
+ if not st.session_state["checkpoint_root_dir"].strip(): # TODO: may auto generate
+ self.unfinished_fields.add("checkpoint_root_dir")
+ st.warning("Please input checkpoint root dir.")
+ elif not os.path.isabs(st.session_state["checkpoint_root_dir"].strip()):
+ self.unfinished_fields.add("checkpoint_root_dir")
st.warning("Please input an absolute path.")
def _set_node_num(self):
@@ -346,8 +351,9 @@ def _set_taskset_args(self):
response_key_col.text_input(
"Response Key :orange-badge[(Needs review)]", key="taskset_response_key"
)
+ self._set_configs_with_st_columns(["temperature", "logprobs"])
- def _set_eval_taskset_idx(self, idx):
+ def _set_eval_taskset_idx(self, idx): # TODO: add delete
st.text_input(
"Taskset Name",
key=f"eval_taskset_{idx}_name",
@@ -457,7 +463,7 @@ def _set_experience_buffer_path(self): # TODO
if `storage_type == StorageType.QUEUE`, default to `None`,
-if `storage_type == StorageType.SQL`, default to `sqlite:///{os.path.join(checkpoint_path, '.cache', project_name, experiment_name)}/data.db`."""
+if `storage_type == StorageType.SQL`, default to `sqlite:///{os.path.join(checkpoint_root_dir, '.cache', project_name, experiment_name)}/data.db`."""
def on_change():
if st.session_state["algorithm_type"] == AlgorithmType.DPO.value:
@@ -545,7 +551,9 @@ def _set_sft_warmup_dataset_args(self):
sft_warmup_messages_key_col,
sft_warmup_prompt_key_col,
sft_warmup_response_key_col,
- ) = st.columns(3)
+ ) = st.columns(
+ 3
+ ) # TODO: select by prompt type
sft_warmup_messages_key_col.text_input(
"SFT Dataset Messages Key :orange-badge[(Needs review)]",
key="sft_warmup_messages_key",
@@ -620,6 +628,33 @@ def _check_engine_num_and_tp_size(self):
"Please ensure that `engine_num * tensor_parallel_size` can be divided by `gpu_per_node` when `node_num > 1`."
)
+ def _set_repeat_times(self): # TODO
+ grouped_adv_algorithms = [
+ AlgorithmType.GRPO.value,
+ AlgorithmType.OPMD.value, # TODO: may add rloo
+ ]
+ if st.session_state["algorithm_type"] in grouped_adv_algorithms:
+ min_repeat_times = 2
+ st.session_state["repeat_times"] = st.session_state["_grouped_adv_repeat_times"]
+ else:
+ min_repeat_times = 1
+ st.session_state["repeat_times"] = st.session_state["_not_grouped_adv_repeat_times"]
+
+ def on_change():
+ if st.session_state["algorithm_type"] in grouped_adv_algorithms:
+ st.session_state["_grouped_adv_repeat_times"] = st.session_state["repeat_times"]
+ else:
+ st.session_state["_not_grouped_adv_repeat_times"] = st.session_state["repeat_times"]
+
+ st.number_input(
+ "Repeat Times",
+ key="repeat_times",
+ min_value=min_repeat_times,
+ help="`repeat_times` is used to set how many experiences each task can generate, "
+ "and it must be greater than `1` when `algorithm_type` is `opmd` or `grpo`.",
+ on_change=on_change,
+ )
+
def _set_sync_method(self):
if st.session_state["algorithm_type"] == AlgorithmType.DPO.value:
st.session_state["sync_method"] = SyncMethod.CHECKPOINT.value
@@ -686,6 +721,9 @@ def _set_seed(self):
def _set_logprobs(self):
st.number_input("Logprobs", key="logprobs", min_value=0, max_value=20)
+ def _set_use_v1(self):
+ st.checkbox("Use V1 Engine", key="use_v1")
+
def _set_enable_prefix_caching(self):
st.checkbox("Prefix Caching", key="enable_prefix_caching")
@@ -700,6 +738,12 @@ def _set_gpu_memory_utilization(self):
def _set_enable_chunked_prefill(self):
st.checkbox("Chunked Prefill", key="enable_chunked_prefill")
+ def _set_enable_thinking(self):
+ st.checkbox("Enable Thinking For Qwen3", key="enable_thinking")
+
+ def _set_enable_openai_api(self):
+ st.checkbox("Enable OpenAI API", key="enable_openai_api")
+
def _set_max_timeout(self):
st.number_input("Max Timeout", key="max_timeout", min_value=0)
@@ -745,6 +789,9 @@ def _set_sft_warmup_steps(self):
def _set_eval_interval(self):
st.number_input("Eval Interval", key="eval_interval", min_value=1)
+ def _set_eval_on_latest_checkpoint(self):
+        st.checkbox("Eval on Latest Checkpoint", key="eval_on_latest_checkpoint")
+
def _set_training_args(self):
st.multiselect(
"Training Args",
@@ -1105,11 +1152,11 @@ def _set_configs_with_st_columns(
def beginner_mode(self):
st.header("Essential Configs")
- self._set_configs_with_st_columns(["project", "name"], columns_config=[1, 3])
+ self._set_configs_with_st_columns(["project", "exp_name"], columns_config=[1, 3])
self._set_model_path()
- self._set_checkpoint_path()
+ self._set_checkpoint_root_dir()
self._set_taskset_path()
@@ -1169,12 +1216,12 @@ def beginner_mode(self):
self._set_configs_with_st_columns(["critic_ppo_micro_batch_size_per_gpu", "critic_lr"])
def _expert_model_part(self):
- self._set_configs_with_st_columns(["project", "name"], columns_config=[1, 3])
+ self._set_configs_with_st_columns(["project", "exp_name"], columns_config=[1, 3])
self._set_model_path()
self._set_critic_model_path()
- self._set_checkpoint_path()
+ self._set_checkpoint_root_dir()
self._set_configs_with_st_columns(["monitor_type", "node_num", "gpu_per_node"])
self._set_configs_with_st_columns(["max_prompt_tokens", "max_response_tokens"])
@@ -1213,34 +1260,36 @@ def _expert_buffer_part(self):
self._set_configs_with_st_columns(["buffer_max_retry_times", "max_retry_interval"])
def _expert_explorer_part(self):
+ self._set_configs_with_st_columns(["sync_method", "sync_interval", "sync_timeout"])
+
self._set_configs_with_st_columns(
- ["engine_type", "engine_num", "tensor_parallel_size", "repeat_times"]
+ [
+ "runner_num",
+ "max_timeout",
+ "explorer_max_retry_times",
+ ]
)
- self._check_engine_num_and_tp_size()
- self._set_configs_with_st_columns(["sync_method", "sync_interval", "sync_timeout"])
+ self._set_configs_with_st_columns(["eval_interval", "eval_on_latest_checkpoint"])
- with st.expander("Advanced Config"):
- self._set_configs_with_st_columns(
- ["runner_num", "temperature", "top_p", "top_k", "seed", "logprobs"]
- )
+ with st.expander("Rollout Model Config", expanded=True):
+ self._set_configs_with_st_columns(["engine_type", "engine_num", "tensor_parallel_size"])
+ self._check_engine_num_and_tp_size()
- self._set_configs_with_st_columns(["dtype", "gpu_memory_utilization"])
- self._set_configs_with_st_columns(
- [
- "max_timeout",
- "explorer_max_retry_times",
- ]
- )
+ self._set_configs_with_st_columns(["gpu_memory_utilization", "dtype", "seed"])
self._set_configs_with_st_columns(
- ["enable_prefix_caching", "enforce_eager", "enable_chunked_prefill"]
+ ["use_v1", "enforce_eager", "enable_prefix_caching", "enable_chunked_prefill"]
)
+ self._set_configs_with_st_columns(["enable_thinking", "enable_openai_api"])
+
+ with st.expander("Auxiliary Models", expanded=True): # TODO
+ pass
+
def _expert_trainer_part(self):
- self._set_configs_with_st_columns( # TODO: may add `trainer_type`
- ["algorithm_type", "sft_warmup_steps", "eval_interval", "save_interval"]
- )
+ self._set_configs_with_st_columns(["algorithm_type", "gamma", "lam"])
+ self._set_configs_with_st_columns(["repeat_times", "save_interval"])
self._check_sft_warmup_dataset_path()
if st.session_state["trainer_type"] == "verl":
@@ -1280,7 +1329,6 @@ def _expert_verl_trainer_part(self):
with rl_algorithm_tab:
st.subheader("RL Algorithm Config")
- self._set_configs_with_st_columns(["gamma", "lam"])
self._set_configs_with_st_columns(["norm_adv_by_std_in_grpo", "use_kl_in_reward"])
self._set_configs_with_st_columns(["kl_penalty", "kl_ctrl_type", "kl_ctrl_coef"])
self._set_configs_with_st_columns(["horizon", "target_kl"])
@@ -1555,7 +1603,7 @@ def _generate_verl_config(self, trainer_nnodes: int = 1, trainer_n_gpus_per_node
"default_hdfs_dir": st.session_state["default_hdfs_dir"],
"remove_previous_ckpt_in_save": st.session_state["remove_previous_ckpt_in_save"],
"del_local_ckpt_after_load": st.session_state["del_local_ckpt_after_load"],
- "default_local_dir": st.session_state["checkpoint_path"],
+ "default_local_dir": st.session_state["checkpoint_root_dir"],
"val_before_train": False,
"sync_freq": st.session_state["sync_interval"],
"max_actor_ckpt_to_keep": st.session_state["max_actor_ckpt_to_keep"],
@@ -1564,6 +1612,123 @@ def _generate_verl_config(self, trainer_nnodes: int = 1, trainer_n_gpus_per_node
}
return trainer_config
+ def _gen_buffer_config(self):
+        experience_buffer_path = st.session_state["experience_buffer_path"].strip()
+        if (
+            not experience_buffer_path
+            and st.session_state["algorithm_type"] != AlgorithmType.DPO.value
+            and st.session_state["storage_type"] == StorageType.SQL.value
+        ):
+            experience_buffer_path = f"sqlite:///{os.path.join(st.session_state['checkpoint_root_dir'], '.cache', st.session_state['project'], st.session_state['exp_name'])}/data.db"
+
+ sft_storage_type = (
+ StorageType.SQL.value
+ if "://" in st.session_state["sft_warmup_dataset_path"]
+ else StorageType.FILE.value
+ ) # TODO
+
+ buffer_config = {
+ "batch_size": st.session_state["train_batch_size"],
+ "total_epochs": st.session_state["total_epochs"],
+ "explorer_input": {
+ "taskset": {
+ "name": "taskset",
+ "storage_type": StorageType.FILE.value,
+ "path": st.session_state["taskset_path"],
+ "split": st.session_state["taskset_split"],
+ "subset_name": st.session_state["taskset_subset_name"],
+ "format": {
+ "prompt_key": st.session_state["taskset_prompt_key"],
+ "response_key": st.session_state["taskset_response_key"],
+ },
+ "rollout_args": {
+ "temperature": st.session_state["temperature"],
+ "logprobs": st.session_state["logprobs"],
+ },
+ },
+ "eval_tasksets": [],
+ "default_workflow_type": st.session_state["default_workflow_type"],
+ "default_reward_fn_type": st.session_state["default_reward_fn_type"],
+ "system_prompt": st.session_state["system_prompt"],
+ "reply_prefix": st.session_state["reply_prefix"],
+ },
+ "trainer_input": {
+ "experience_buffer": {
+ "name": "experience_buffer",
+ "storage_type": st.session_state["storage_type"],
+ "path": experience_buffer_path,
+ },
+ "sft_warmup_steps": st.session_state["sft_warmup_steps"],
+ },
+ "max_retry_times": st.session_state["buffer_max_retry_times"],
+ "max_retry_interval": st.session_state["max_retry_interval"],
+ }
+
+ for idx in range(st.session_state["_eval_tasksets_num"]):
+ if st.session_state[f"eval_taskset_{idx}_path"].strip():
+ buffer_config["explorer_input"]["eval_tasksets"].append(
+ {
+ "name": st.session_state[f"eval_taskset_{idx}_name"],
+ "path": st.session_state[f"eval_taskset_{idx}_path"],
+ "subset_name": st.session_state[f"eval_taskset_{idx}_subset_name"],
+ "split": st.session_state[f"eval_taskset_{idx}_split"],
+ "prompt_key": st.session_state[f"eval_taskset_{idx}_prompt_key"],
+ "response_key": st.session_state[f"eval_taskset_{idx}_response_key"],
+ }
+ )
+ if st.session_state["algorithm_type"] == AlgorithmType.DPO.value:
+ experience_buffer = buffer_config["trainer_input"]["experience_buffer"]
+ experience_buffer["split"] = st.session_state["dpo_dataset_train_split"]
+ experience_buffer["format"] = {
+ "prompt_type": st.session_state["dpo_dataset_prompt_type"],
+ "prompt_key": st.session_state["dpo_dataset_prompt_key"],
+ "chosen_key": st.session_state["dpo_dataset_chosen_key"],
+ "rejected_key": st.session_state["dpo_dataset_rejected_key"],
+ }
+ if st.session_state["sft_warmup_dataset_path"].strip():
+ buffer_config["trainer_input"]["sft_warmup_dataset"] = {
+ "name": "sft_warmup_dataset",
+ "storage_type": sft_storage_type,
+ "path": st.session_state["sft_warmup_dataset_path"],
+ "split": st.session_state["sft_warmup_train_split"],
+ "format": {
+ "prompt_type": st.session_state["sft_warmup_prompt_type"],
+ "messages_key": st.session_state["sft_warmup_messages_key"],
+ "prompt_key": st.session_state["sft_warmup_prompt_key"],
+ "response_key": st.session_state["sft_warmup_response_key"],
+ },
+ }
+
+ return buffer_config
+
+ def _gen_explorer_config(self):
+ explorer_config = {
+ "runner_num": st.session_state["runner_num"],
+ "max_timeout": st.session_state["max_timeout"],
+ "max_retry_times": st.session_state["explorer_max_retry_times"],
+ "rollout_model": {
+ "engine_type": st.session_state["engine_type"],
+ "engine_num": st.session_state["engine_num"],
+ "tensor_parallel_size": st.session_state["tensor_parallel_size"],
+ "use_v1": st.session_state["use_v1"],
+ "enforce_eager": st.session_state["enforce_eager"],
+ "enable_prefix_caching": st.session_state["enable_prefix_caching"],
+ "enable_chunked_prefill": st.session_state["enable_chunked_prefill"],
+ "gpu_memory_utilization": st.session_state["gpu_memory_utilization"],
+ "dtype": st.session_state["dtype"],
+ "seed": st.session_state["seed"],
+ # "max_prompt_tokens": None, # TODO
+ # "max_response_tokens": None, # TODO
+ # "chat_template": None, # TODO: add chat template
+ "enable_thinking": st.session_state["enable_thinking"],
+ "enable_openai_api": st.session_state["enable_openai_api"],
+ },
+ "auxiliary_models": [],
+ "eval_interval": st.session_state["eval_interval"],
+ "eval_on_latest_checkpoint": st.session_state["eval_on_latest_checkpoint"],
+ }
+ return explorer_config
+
def generate_config(self):
if st.session_state["mode"] == "both":
trainer_nnodes = (
@@ -1582,19 +1747,6 @@ def generate_config(self):
else:
trainer_n_gpus_per_node = st.session_state["gpu_per_node"]
- if st.session_state["algorithm_type"] != AlgorithmType.DPO.value:
- experience_buffer_path = st.session_state["experience_buffer_path"].strip()
- if (
- not experience_buffer_path
- and st.session_state["storage_type"] == StorageType.SQL.value
- ):
- experience_buffer_path = f"sqlite:///{os.path.join(st.session_state['checkpoint_path'], '.cache', st.session_state['project'], st.session_state['exp_name'])}/data.db"
-
- sft_storage_type = (
- StorageType.SQL.value
- if "://" in st.session_state["sft_warmup_dataset_path"]
- else StorageType.FILE.value
- ) # TODO
if st.session_state["trainer_type"] == "verl":
trainer_config = self._generate_verl_config(
trainer_nnodes=trainer_nnodes, trainer_n_gpus_per_node=trainer_n_gpus_per_node
@@ -1623,12 +1775,15 @@ def generate_config(self):
config = {
"mode": st.session_state["mode"],
"project": st.session_state["project"],
- "name": st.session_state["name"],
- "checkpoint_root_dir": st.session_state["checkpoint_path"],
+ "name": st.session_state["exp_name"],
+ "checkpoint_root_dir": st.session_state["checkpoint_root_dir"],
"algorithm": {
"algorithm_type": st.session_state["algorithm_type"],
"repeat_times": st.session_state["repeat_times"],
+ "gamma": st.session_state["gamma"],
+ "lam": st.session_state["lam"],
},
+ "data_processor": {}, # TODO: Add data processor config
"model": {
"model_path": st.session_state["model_path"],
"max_prompt_tokens": st.session_state["max_prompt_tokens"],
@@ -1638,75 +1793,27 @@ def generate_config(self):
"node_num": st.session_state["node_num"],
"gpu_per_node": st.session_state["gpu_per_node"],
},
- "buffer": {
- "total_epochs": st.session_state["total_epochs"],
- "batch_size": st.session_state["train_batch_size"],
- "max_retry_times": st.session_state["buffer_max_retry_times"],
- "max_retry_interval": st.session_state["max_retry_interval"],
- "explorer_input": {
- "taskset": {
- "name": "taskset",
- "storage_type": StorageType.FILE.value,
- "path": st.session_state["taskset_path"],
- "split": st.session_state["taskset_split"],
- "subset_name": st.session_state["taskset_subset_name"],
- "format": {
- "prompt_key": st.session_state["taskset_prompt_key"],
- "response_key": st.session_state["taskset_response_key"],
- },
- "rollout_args": {
- "n": st.session_state["repeat_times"],
- "temperature": st.session_state["temperature"],
- "top_p": st.session_state["top_p"],
- "top_k": st.session_state["top_k"],
- "logprobs": st.session_state["logprobs"],
- },
- },
- "eval_tasksets": [], # TODO: add eval tasksets
- "default_workflow_type": st.session_state["default_workflow_type"],
- "default_reward_fn_type": st.session_state["default_reward_fn_type"],
- "system_prompt": st.session_state["system_prompt"],
- "reply_prefix": st.session_state["reply_prefix"],
- },
- "trainer_input": {
- "experience_buffer": {
- "name": "experience_buffer",
- "storage_type": st.session_state["storage_type"],
- "path": experience_buffer_path,
- },
- "sft_warmup_steps": st.session_state["sft_warmup_steps"],
- },
- },
- "explorer": {
- "eval_interval": st.session_state["eval_interval"],
- "engine_type": st.session_state["engine_type"],
- "engine_num": st.session_state["engine_num"],
- "runner_num": st.session_state["runner_num"],
- # "chat_template": None, # TODO: add chat template
- "tensor_parallel_size": st.session_state["tensor_parallel_size"],
- "enable_prefix_caching": st.session_state["enable_prefix_caching"],
- "enforce_eager": st.session_state["enforce_eager"],
- "dtype": st.session_state["dtype"],
- "seed": st.session_state["seed"],
- "gpu_memory_utilization": st.session_state["gpu_memory_utilization"],
- "enable_chunked_prefill": st.session_state["enable_chunked_prefill"],
- "use_v1": True,
- "max_timeout": st.session_state["max_timeout"],
- "max_retry_times": st.session_state["explorer_max_retry_times"],
- },
- "synchronizer": {
- "sync_method": st.session_state["sync_method"],
- "sync_interval": st.session_state["sync_interval"],
- "sync_timeout": st.session_state["sync_timeout"],
- },
+ "buffer": self._gen_buffer_config(),
+ "explorer": self._gen_explorer_config(),
"trainer": {
"trainer_type": st.session_state["trainer_type"],
- "trainer_config": trainer_config,
"save_interval": st.session_state["save_interval"],
+ "enable_preview": True, # TODO
+ "actor_use_kl_loss": st.session_state["actor_use_kl_loss"],
+ "actor_kl_loss_coef": st.session_state["actor_kl_loss_coef"],
+ "actor_entropy_coef": st.session_state["actor_entropy_coef"],
+ "actor_grad_clip": st.session_state["actor_grad_clip"],
+ "actor_clip_ratio": st.session_state["actor_clip_ratio"],
+ "trainer_config": trainer_config,
},
"monitor": {
"monitor_type": st.session_state["monitor_type"],
},
+ "synchronizer": {
+ "sync_method": st.session_state["sync_method"],
+ "sync_interval": st.session_state["sync_interval"],
+ "sync_timeout": st.session_state["sync_timeout"],
+ },
}
if st.session_state["adv_estimator"] == AdvantageEstimator.GAE.value:
@@ -1716,40 +1823,6 @@ def generate_config(self):
else st.session_state["model_path"]
)
- for idx in range(st.session_state["_eval_tasksets_num"]):
- if st.session_state[f"eval_taskset_{idx}_path"].strip():
- config["buffer"]["explorer_input"]["eval_tasksets"].append(
- {
- "name": st.session_state[f"eval_taskset_{idx}_name"],
- "path": st.session_state[f"eval_taskset_{idx}_path"],
- "subset_name": st.session_state[f"eval_taskset_{idx}_subset_name"],
- "split": st.session_state[f"eval_taskset_{idx}_split"],
- "prompt_key": st.session_state[f"eval_taskset_{idx}_prompt_key"],
- "response_key": st.session_state[f"eval_taskset_{idx}_response_key"],
- }
- )
- if st.session_state["algorithm_type"] == AlgorithmType.DPO.value:
- experience_buffer = config["buffer"]["trainer_input"]["experience_buffer"]
- experience_buffer["split"] = st.session_state["dpo_dataset_train_split"]
- experience_buffer["format"] = {
- "prompt_type": st.session_state["dpo_dataset_prompt_type"],
- "prompt_key": st.session_state["dpo_dataset_prompt_key"],
- "chosen_key": st.session_state["dpo_dataset_chosen_key"],
- "rejected_key": st.session_state["dpo_dataset_rejected_key"],
- }
- if st.session_state["sft_warmup_dataset_path"].strip():
- config["buffer"]["trainer_input"]["sft_warmup_dataset"] = {
- "name": "sft_warmup_dataset",
- "storage_type": sft_storage_type,
- "path": st.session_state["sft_warmup_dataset_path"],
- "split": st.session_state["sft_warmup_train_split"],
- "format": {
- "prompt_type": st.session_state["sft_warmup_prompt_type"],
- "messages_key": st.session_state["sft_warmup_messages_key"],
- "prompt_key": st.session_state["sft_warmup_prompt_key"],
- "response_key": st.session_state["sft_warmup_response_key"],
- },
- }
st.session_state.config_generated = True
st.header("Generated Config File")
buttons = st.container()
@@ -1758,7 +1831,7 @@ def generate_config(self):
save_btn.download_button(
"Save",
data=yaml_config,
- file_name=f"{config['monitor']['project']}-{config['monitor']['name']}.yaml",
+ file_name=f"{config['project']}-{config['name']}.yaml",
mime="text/plain",
icon=":material/download:",
use_container_width=True,
From c2c4cc1b8ddec931f18248c397fef6f18eb47ab3 Mon Sep 17 00:00:00 2001
From: chenyushuo <297086016@qq.com>
Date: Fri, 23 May 2025 11:37:00 +0800
Subject: [PATCH 2/6] remove dup args in verl config
---
trinity/common/verl_config.py | 97 ++++++---------
trinity/manager/config_manager.py | 188 +++++-------------------------
2 files changed, 70 insertions(+), 215 deletions(-)
diff --git a/trinity/common/verl_config.py b/trinity/common/verl_config.py
index dd896a23f1..0b04ef4a6a 100644
--- a/trinity/common/verl_config.py
+++ b/trinity/common/verl_config.py
@@ -1,3 +1,4 @@
+import math
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional
@@ -13,20 +14,7 @@
@dataclass
class Data:
- tokenizer: Optional[str] = None
- train_files: str = ""
- val_files: str = ""
- prompt_key: str = "prompt"
- max_prompt_length: int = 512
- max_response_length: int = 512
train_batch_size: int = 1024
- val_batch_size: Optional[int] = None
- return_raw_input_ids: bool = False
- return_raw_chat: bool = False
- shuffle: bool = True
- filter_overlong_prompts: bool = False
- truncation: str = "error"
- image_key: str = "images"
@dataclass
@@ -109,30 +97,7 @@ class Ref:
@dataclass
class Rollout:
- name: str = "vllm"
temperature: float = 1.0
- top_k: int = -1
- top_p: float = 1.0
- use_fire_sampling: bool = False
- prompt_length: int = 0
- response_length: int = 0
- dtype: str = "bfloat16"
- gpu_memory_utilization: float = 0.5
- ignore_eos: bool = False
- enforce_eager: bool = True
- free_cache_engine: bool = True
- load_format: str = "dummy_dtensor"
- tensor_model_parallel_size: int = 2
- max_num_batched_tokens: int = 8192
- max_model_len: Optional[int] = None
- max_num_seqs: int = 1024
- log_prob_micro_batch_size: Optional[int] = None
- log_prob_micro_batch_size_per_gpu: int = 1
- log_prob_use_dynamic_bsz: bool = False
- log_prob_max_token_len_per_gpu: int = 0
- disable_log_stats: bool = True
- enable_chunked_prefill: bool = True
- do_sample: bool = True
n: int = 1 # > 1 for grpo
@@ -283,36 +248,50 @@ def synchronize_config(self, config: Config) -> None:
)
else:
rollout_gpu_num = 0
- rollout_node_num = rollout_gpu_num // config.cluster.gpu_per_node
- self.trainer.nnodes = config.cluster.node_num - rollout_node_num
- self.actor_rollout_ref.model.path = config.model.model_path
- self.critic.model.path = config.model.critic_model_path
- self.critic.model.tokenizer_path = config.model.critic_model_path
if config.cluster.node_num == 1:
# for single node scenarios, rollout and training are on the same node
+ self.trainer.nnodes = config.cluster.node_num
self.trainer.n_gpus_per_node = config.cluster.gpu_per_node - rollout_gpu_num
else:
# for multi-node scenarios, some nodes for rollout, others for training
+ assert (
+ rollout_gpu_num % config.cluster.gpu_per_node == 0
+ ), "rollout_gpu_num must be divisible by `gpu_per_node`"
+ rollout_node_num = math.ceil(rollout_gpu_num / config.cluster.gpu_per_node)
+ self.trainer.nnodes = config.cluster.node_num - rollout_node_num
+ if self.trainer.nnodes < 1:
+ raise ValueError("The number of training nodes must be greater than 0")
self.trainer.n_gpus_per_node = config.cluster.gpu_per_node
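+        # Illustrative example: with node_num=3, gpu_per_node=8 and rollout_gpu_num=8
+        # (say, engine_num=2 * tensor_parallel_size=4), rollout_node_num=1,
+        # trainer.nnodes=2 and trainer.n_gpus_per_node=8, i.e. 16 GPUs for training.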
- self.trainer.sync_freq = config.synchronizer.sync_interval
- self.trainer.save_freq = config.trainer.save_interval
- self.synchronizer = config.synchronizer
- self.actor_rollout_ref.synchronizer = config.synchronizer
- self.buffer = config.buffer
+
world_size = self.trainer.nnodes * self.trainer.n_gpus_per_node
if config.buffer.batch_size % world_size != 0:
raise ValueError(
f"batch_size ({config.buffer.batch_size}) must be divisible by ({world_size})"
)
- # TODO: use dynamic read_batch_size to support multi-round scenarios
- # Get the experiences of one explore step
+
+ self.trainer.sync_freq = config.synchronizer.sync_interval
+ self.trainer.save_freq = config.trainer.save_interval
self.trainer.project_name = config.project
self.trainer.experiment_name = config.name
- self.data.train_batch_size = config.buffer.batch_size
self.trainer.default_local_dir = config.checkpoint_job_dir
self.trainer.sft_warmup_steps = config.buffer.trainer_input.sft_warmup_steps
- self.actor_rollout_ref.actor.ppo_mini_batch_size = config.buffer.batch_size
+
+ self.buffer = config.buffer
+ # TODO: use dynamic read_batch_size to support multi-round scenarios
+ # Get the experiences of one explore step
+ self.data.train_batch_size = config.buffer.batch_size
+
+ self.synchronizer = config.synchronizer
+ self.actor_rollout_ref.synchronizer = config.synchronizer
+
+ # Actor / Critic config
+ self.actor_rollout_ref.model.path = config.model.model_path
+ self.critic.model.path = config.model.critic_model_path
+ self.critic.model.tokenizer_path = config.model.critic_model_path
+ self.actor_rollout_ref.actor.ppo_mini_batch_size = (
+ config.buffer.batch_size
+ ) # TODO: may allow user to change
self.actor_rollout_ref.rollout.temperature = (
config.buffer.explorer_input.taskset.rollout_args.temperature
)
@@ -320,6 +299,15 @@ def synchronize_config(self, config: Config) -> None:
self.critic.ppo_mini_batch_size = config.buffer.batch_size
self.critic.rollout_n = self.actor_rollout_ref.rollout.n
+ self.actor_rollout_ref.actor.use_kl_loss = config.trainer.actor_use_kl_loss
+ self.actor_rollout_ref.actor.kl_loss_coef = config.trainer.actor_kl_loss_coef
+ self.actor_rollout_ref.actor.entropy_coeff = config.trainer.actor_entropy_coef
+ self.actor_rollout_ref.actor.grad_clip = config.trainer.actor_grad_clip
+ self.actor_rollout_ref.actor.clip_ratio = config.trainer.actor_clip_ratio
+
+ # Algorithm related config
+ self.algorithm.gamma = config.algorithm.gamma
+ self.algorithm.lam = config.algorithm.lam
self.actor_rollout_ref.actor.algorithm_type = config.algorithm.algorithm_type
if config.algorithm.algorithm_type == AlgorithmType.PPO:
logger.info("Using GAE `adv_estimator` for PPO")
@@ -328,15 +316,6 @@ def synchronize_config(self, config: Config) -> None:
logger.info("Using GRPO `adv_estimator` for GRPO")
self.algorithm.adv_estimator = AdvantageEstimator.GRPO.value
- # copy trainer related config from global config
- self.algorithm.gamma = config.algorithm.gamma
- self.algorithm.lam = config.algorithm.lam
- self.actor_rollout_ref.actor.use_kl_loss = config.trainer.actor_use_kl_loss
- self.actor_rollout_ref.actor.kl_loss_coef = config.trainer.actor_kl_loss_coef
- self.actor_rollout_ref.actor.entropy_coeff = config.trainer.actor_entropy_coef
- self.actor_rollout_ref.actor.grad_clip = config.trainer.actor_grad_clip
- self.actor_rollout_ref.actor.clip_ratio = config.trainer.actor_clip_ratio
-
if self.actor_rollout_ref.actor.algorithm_type.is_dpo(): # for DPO
if not self.actor_rollout_ref.actor.use_kl_loss:
self.actor_rollout_ref.actor.use_kl_loss = True
diff --git a/trinity/manager/config_manager.py b/trinity/manager/config_manager.py
index c8a168d1dc..2b7f5f5b8d 100644
--- a/trinity/manager/config_manager.py
+++ b/trinity/manager/config_manager.py
@@ -834,33 +834,6 @@ def on_change():
def _set_ppo_epochs(self):
st.number_input("PPO Epochs", key="ppo_epochs", min_value=1)
- def _set_repeat_times(self): # TODO
- grouped_adv_algorithms = [
- AlgorithmType.GRPO.value,
- AlgorithmType.OPMD.value, # TODO: may add rloo
- ]
- if st.session_state["algorithm_type"] in grouped_adv_algorithms:
- min_repeat_times = 2
- st.session_state["repeat_times"] = st.session_state["_grouped_adv_repeat_times"]
- else:
- min_repeat_times = 1
- st.session_state["repeat_times"] = st.session_state["_not_grouped_adv_repeat_times"]
-
- def on_change():
- if st.session_state["algorithm_type"] in grouped_adv_algorithms:
- st.session_state["_grouped_adv_repeat_times"] = st.session_state["repeat_times"]
- else:
- st.session_state["_not_grouped_adv_repeat_times"] = st.session_state["repeat_times"]
-
- st.number_input(
- "Repeat Times",
- key="repeat_times",
- min_value=min_repeat_times,
- help="`repeat_times` is used to set how many experiences each task can generate, "
- "and it must be greater than `1` when `algorithm_type` is `opmd` or `grpo`.",
- on_change=on_change,
- )
-
def _set_training_strategy(self):
st.selectbox(
"Training Strategy",
@@ -1389,7 +1362,7 @@ def expert_mode(self):
with tab:
func()
- def _generate_verl_config(self, trainer_nnodes: int = 1, trainer_n_gpus_per_node: int = 8):
+ def _generate_verl_config(self):
balance_batch = "balance_batch" in st.session_state["training_args"]
enable_gradient_checkpointing = (
"gradient_checkpointing" in st.session_state["training_args"]
@@ -1411,33 +1384,10 @@ def _generate_verl_config(self, trainer_nnodes: int = 1, trainer_n_gpus_per_node
st.session_state["max_prompt_tokens"] + st.session_state["max_response_tokens"]
)
- critic_model_path = (
- st.session_state["critic_model_path"].strip()
- if st.session_state["critic_model_path"].strip()
- else st.session_state["model_path"]
- )
trainer_config = {
- "data": {
- "tokenizer": None,
- "train_files": "placeholder",
- "val_files": "placeholder",
- "prompt_key": "placeholder",
- "max_prompt_length": st.session_state["max_prompt_tokens"],
- "max_response_length": st.session_state["max_response_tokens"],
- "train_batch_size": st.session_state["train_batch_size"]
- * st.session_state["repeat_times"],
- "val_batch_size": None,
- "return_raw_input_ids": False,
- "return_raw_chat": False,
- "shuffle": True,
- "filter_overlong_prompts": False,
- "truncation": "error",
- "image_key": "images",
- },
"actor_rollout_ref": {
"hybrid_engine": True,
"model": {
- "path": st.session_state["model_path"],
"external_lib": None,
"override_config": {},
"enable_gradient_checkpointing": enable_gradient_checkpointing,
@@ -1451,11 +1401,6 @@ def _generate_verl_config(self, trainer_nnodes: int = 1, trainer_n_gpus_per_node
],
"use_dynamic_bsz": use_dynamic_bsz,
"ppo_max_token_len_per_gpu": ppo_max_token_len_per_gpu,
- "grad_clip": st.session_state["actor_grad_clip"],
- "clip_ratio": st.session_state["actor_clip_ratio"],
- "entropy_coeff": st.session_state["actor_entropy_coef"],
- "use_kl_loss": st.session_state["actor_use_kl_loss"],
- "kl_loss_coef": st.session_state["actor_kl_loss_coef"],
"kl_loss_type": st.session_state["actor_kl_loss_type"],
"ppo_epochs": st.session_state["ppo_epochs"],
"shuffle": False,
@@ -1490,33 +1435,39 @@ def _generate_verl_config(self, trainer_nnodes: int = 1, trainer_n_gpus_per_node
],
},
"rollout": {
- "name": "vllm",
"temperature": st.session_state["temperature"],
- "top_k": -1,
- "top_p": 1,
- "use_fire_sampling": False,
- "prompt_length": st.session_state["max_prompt_tokens"],
- "response_length": st.session_state["max_response_tokens"],
- "dtype": "bfloat16",
- "gpu_memory_utilization": 0.4,
- "ignore_eos": False,
- "enforce_eager": True,
- "free_cache_engine": True,
- "load_format": "dummy_dtensor",
- "tensor_model_parallel_size": 2,
- "max_num_batched_tokens": 8192,
- "max_model_len": None,
- "max_num_seqs": 1024,
- "log_prob_micro_batch_size_per_gpu": 4,
- "log_prob_use_dynamic_bsz": use_dynamic_bsz,
- "log_prob_max_token_len_per_gpu": ppo_max_token_len_per_gpu,
- "disable_log_stats": True,
- "enable_chunked_prefill": True,
- "do_sample": True,
"n": st.session_state["repeat_times"],
},
},
- "critic": {
+ "reward_model": {
+ "enable": False,
+ },
+ "custom_reward_function": {"path": None, "name": "compute_score"},
+ "algorithm": {
+ "kl_penalty": st.session_state["kl_penalty"],
+ "kl_ctrl": {
+ "type": st.session_state["kl_ctrl_type"],
+ "kl_coef": st.session_state["kl_ctrl_coef"],
+ },
+ },
+ "trainer": {
+ "balance_batch": balance_batch,
+ "logger": ["tensorboard"],
+ "resume_mode": st.session_state["resume_mode"],
+ "resume_from_path": st.session_state["resume_from_path"],
+ "test_freq": 100,
+ "critic_warmup": st.session_state["critic_warmup"],
+ "default_hdfs_dir": st.session_state["default_hdfs_dir"],
+ "remove_previous_ckpt_in_save": st.session_state["remove_previous_ckpt_in_save"],
+ "del_local_ckpt_after_load": st.session_state["del_local_ckpt_after_load"],
+ "val_before_train": False,
+ "max_actor_ckpt_to_keep": st.session_state["max_actor_ckpt_to_keep"],
+ "max_critic_ckpt_to_keep": st.session_state["max_critic_ckpt_to_keep"],
+ },
+ }
+
+ if st.session_state["adv_estimator"] == AdvantageEstimator.GAE.value:
+ trainer_config["critic"] = {
"strategy": st.session_state["training_strategy"],
"optim": {
"lr": st.session_state["critic_lr"],
@@ -1529,8 +1480,6 @@ def _generate_verl_config(self, trainer_nnodes: int = 1, trainer_n_gpus_per_node
),
},
"model": {
- "path": critic_model_path,
- "tokenizer_path": critic_model_path,
"override_config": {},
"external_lib": None,
"enable_gradient_checkpointing": enable_gradient_checkpointing,
@@ -1555,61 +1504,7 @@ def _generate_verl_config(self, trainer_nnodes: int = 1, trainer_n_gpus_per_node
"grad_clip": st.session_state["critic_grad_clip"],
"cliprange_value": st.session_state["critic_cliprange_value"],
"checkpoint": {"contents": st.session_state["critic_checkpoint"]},
- },
- "reward_model": {
- "enable": False,
- "strategy": "fsdp",
- "model": {
- "input_tokenizer": st.session_state["model_path"],
- "path": "~/models/FsfairX-LLaMA3-RM-v0.1",
- "external_lib": None,
- "use_remove_padding": False,
- "fsdp_config": {
- "min_num_params": 0,
- "param_offload": False,
- "fsdp_size": -1,
- },
- },
- "ulysses_sequence_parallel_size": 1,
- "use_dynamic_bsz": use_dynamic_bsz,
- "forward_max_token_len_per_gpu": ppo_max_token_len_per_gpu * 2,
- "reward_manager": "naive",
- },
- "custom_reward_function": {"path": None, "name": "compute_score"},
- "algorithm": {
- "gamma": st.session_state["gamma"],
- "lam": st.session_state["lam"],
- "adv_estimator": st.session_state["adv_estimator"],
- "kl_penalty": st.session_state["kl_penalty"],
- "kl_ctrl": {
- "type": st.session_state["kl_ctrl_type"],
- "kl_coef": st.session_state["kl_ctrl_coef"],
- },
- },
- "trainer": {
- "balance_batch": balance_batch,
- "total_epochs": st.session_state["total_epochs"],
- "project_name": st.session_state["project"],
- "experiment_name": st.session_state["exp_name"],
- "logger": ["tensorboard"],
- "val_generations_to_log_to_wandb": 0,
- "nnodes": trainer_nnodes,
- "n_gpus_per_node": trainer_n_gpus_per_node,
- "save_freq": st.session_state["save_interval"],
- "resume_mode": st.session_state["resume_mode"],
- "resume_from_path": st.session_state["resume_from_path"],
- "test_freq": 100,
- "critic_warmup": st.session_state["critic_warmup"],
- "default_hdfs_dir": st.session_state["default_hdfs_dir"],
- "remove_previous_ckpt_in_save": st.session_state["remove_previous_ckpt_in_save"],
- "del_local_ckpt_after_load": st.session_state["del_local_ckpt_after_load"],
- "default_local_dir": st.session_state["checkpoint_root_dir"],
- "val_before_train": False,
- "sync_freq": st.session_state["sync_interval"],
- "max_actor_ckpt_to_keep": st.session_state["max_actor_ckpt_to_keep"],
- "max_critic_ckpt_to_keep": st.session_state["max_critic_ckpt_to_keep"],
- },
- }
+ }
return trainer_config
def _gen_buffer_config(self):
@@ -1730,27 +1625,8 @@ def _gen_explorer_config(self):
return explorer_config
def generate_config(self):
- if st.session_state["mode"] == "both":
- trainer_nnodes = (
- st.session_state["node_num"]
- - st.session_state["engine_num"]
- * st.session_state["tensor_parallel_size"]
- // st.session_state["gpu_per_node"]
- )
- else:
- trainer_nnodes = st.session_state["node_num"]
- if st.session_state["node_num"] == 1 and st.session_state["mode"] == "both":
- trainer_n_gpus_per_node = (
- st.session_state["gpu_per_node"]
- - st.session_state["engine_num"] * st.session_state["tensor_parallel_size"]
- )
- else:
- trainer_n_gpus_per_node = st.session_state["gpu_per_node"]
-
if st.session_state["trainer_type"] == "verl":
- trainer_config = self._generate_verl_config(
- trainer_nnodes=trainer_nnodes, trainer_n_gpus_per_node=trainer_n_gpus_per_node
- )
+ trainer_config = self._generate_verl_config()
else:
raise ValueError(f"Invalid trainer type: {st.session_state['trainer_type']}")
From 48b2f8447c6df3d8c90d07e68ed0c6626853c149 Mon Sep 17 00:00:00 2001
From: chenyushuo <297086016@qq.com>
Date: Fri, 23 May 2025 14:48:20 +0800
Subject: [PATCH 3/6] remove unused args in verl config yaml
---
.../source/tutorial/trinity_configs.md | 103 ++-------------
examples/async_gsm8k/verl_config.yaml | 118 -----------------
examples/dpo_humanlike/train_dpo.yaml | 116 -----------------
examples/grpo_alfworld/train_alfworld.yaml | 119 ------------------
examples/grpo_gsm8k/train_gsm8k.yaml | 119 ------------------
examples/grpo_math/train_math.yaml | 112 -----------------
examples/grpo_sciworld/train_sciworld.yaml | 114 -----------------
examples/grpo_webshop/train_webshop.yaml | 119 ------------------
examples/opmd_gsm8k/train_opmd_gsm8k.yaml | 119 ------------------
examples/ppo_countdown/train_countdown.yaml | 86 -------------
tests/common/vllm_test.py | 80 ++++++------
tests/template/verl_config.yaml | 46 -------
trinity/common/config.py | 4 +-
trinity/manager/config_manager.py | 88 ++++++-------
trinity/trainer/verl_trainer.py | 11 --
15 files changed, 97 insertions(+), 1257 deletions(-)
diff --git a/docs/sphinx_doc/source/tutorial/trinity_configs.md b/docs/sphinx_doc/source/tutorial/trinity_configs.md
index 255bb58018..ea31b34351 100644
--- a/docs/sphinx_doc/source/tutorial/trinity_configs.md
+++ b/docs/sphinx_doc/source/tutorial/trinity_configs.md
@@ -187,39 +187,30 @@ Support `nccl` and `checkpoint`, `nccl` represents that model weights in `explor
```yaml
trainer:
trainer_type: 'verl'
- trainer_config_path: 'examples/ppo_countdown/train_countdown.yaml'
save_interval: 100
+ trainer_config_path: 'examples/ppo_countdown/train_countdown.yaml'
```
- `trainer.trainer_type`: The backend of the trainer, Only `verl` is supported.
-- `trainer.trainer_config_path`: The path to the trainer configuration file. It must be set manually.
- `trainer.save_interval`: The interval steps between two checkpoints. Default is `100`.
+- `trainer.actor_grad_clip`: Gradient clip for actor model training.
+- `trainer.actor_clip_ratio`: PPO clipping ratio used when computing the policy loss.
+- `trainer.actor_entropy_coef`: Coefficient of the entropy bonus in the policy loss.
+- `trainer.actor_use_kl_loss`: Whether to enable kl loss.
+- `trainer.actor_kl_loss_coef`: The coefficient of kl loss.
+
+- `trainer.trainer_config`: The trainer configuration specified inline. Only one of `trainer.trainer_config` and `trainer.trainer_config_path` needs to be set (see the example below).
+- `trainer.trainer_config_path`: The path to the trainer configuration file; set this manually when `trainer.trainer_config` is not given inline.
+
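+For illustration, a `trainer` section that sets these fields inline might look like the following sketch (the field names follow `TrainerConfig`; the numeric values are placeholders rather than recommended settings):
+
+```yaml
+trainer:
+  trainer_type: 'verl'
+  save_interval: 100
+  actor_use_kl_loss: false
+  actor_kl_loss_coef: 0.001
+  actor_entropy_coef: 0.001
+  actor_grad_clip: 1.0
+  actor_clip_ratio: 0.2
+  trainer_config_path: 'examples/ppo_countdown/train_countdown.yaml'
+```
+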
### veRL Trainer Configuration
Here we mainly introduce the parameters that can be set in veRL. For the specific meaning of the parameters, please refer to the official document of [veRL](https://github.com/volcengine/verl/blob/0bdf7f469854815177e73dcfe9e420836c952e6e/docs/examples/config.rst).
```yaml
-data:
- tokenizer: null
- train_files: train_example.parquet
- val_files: test_example.parquet
- prompt_key: prompt
- max_prompt_length: 256
- max_response_length: 1024
- train_batch_size: 256
- val_batch_size: null
- return_raw_input_ids: False # This should be set to true when the tokenizer between policy and rm differs
- return_raw_chat: False
- shuffle: True
- filter_overlong_prompts: False # for large-scale dataset, filtering overlong prompts could be timeconsuming. You should disable this and set `truncation='left'
- truncation: error
- image_key: images
-
actor_rollout_ref:
hybrid_engine: True
model:
- path: /PATH/TO/MODEL/CHECKPOINT/
external_lib: null
override_config: { }
enable_gradient_checkpointing: True
@@ -231,11 +222,6 @@ actor_rollout_ref:
ppo_micro_batch_size_per_gpu: 4
use_dynamic_bsz: True
ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
- grad_clip: 1.0
- clip_ratio: 0.2
- entropy_coeff: 0.001
- use_kl_loss: False # True for GRPO
- kl_loss_coef: 0.001 # for grpo
kl_loss_type: low_var_kl # for grpo
ppo_epochs: 1
shuffle: False
@@ -270,35 +256,6 @@ actor_rollout_ref:
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
- rollout:
- name: vllm
- temperature: 1.0
- top_k: -1 # 0 for hf rollout, -1 for vllm rollout
- top_p: 1
- use_fire_sampling: False # https://arxiv.org/abs/2410.21236
- prompt_length: ${data.max_prompt_length} # not use for opensource
- response_length: ${data.max_response_length}
- # for vllm rollout
- dtype: bfloat16 # should align with FSDP
- gpu_memory_utilization: 0.4
- ignore_eos: False
- enforce_eager: True
- free_cache_engine: True
- load_format: dummy_dtensor
- tensor_model_parallel_size: 2
- max_num_batched_tokens: 8192
- max_model_len: null
- max_num_seqs: 1024
- # log_prob_micro_batch_size: 8 # will be deprecated, use log_prob_micro_batch_size_per_gpu
- log_prob_micro_batch_size_per_gpu: 4
- log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
- log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
- disable_log_stats: True
- enable_chunked_prefill: True # could get higher throughput
- # for hf rollout
- do_sample: True
- # number of responses (i.e. num sample times)
- n: 1 # > 1 for grpo
critic:
strategy: fsdp
@@ -309,8 +266,6 @@ critic:
warmup_style: constant # select from constant/cosine
total_training_steps: -1 # must be override by program
model:
- path: /PATH/TO/MODEL/CHECKPOINT/
- tokenizer_path: ${actor_rollout_ref.model.path}
override_config: { }
external_lib: ${actor_rollout_ref.model.external_lib}
enable_gradient_checkpointing: True
@@ -323,7 +278,6 @@ critic:
min_num_params: 0
fsdp_size: -1
ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
- # ppo_micro_batch_size: 8 # will be deprecated, use ppo_micro_batch_size_per_gpu
ppo_micro_batch_size_per_gpu: 8
forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
@@ -335,34 +289,11 @@ critic:
grad_clip: 1.0
cliprange_value: 0.5
-reward_model:
- enable: False
- strategy: fsdp
- model:
- input_tokenizer: ${actor_rollout_ref.model.path} # set this to null if the chat template is identical
- path: ~/models/FsfairX-LLaMA3-RM-v0.1
- external_lib: ${actor_rollout_ref.model.external_lib}
- use_remove_padding: False
- fsdp_config:
- min_num_params: 0
- param_offload: False
- fsdp_size: -1
- # micro_batch_size: null # will be deprecated, use micro_batch_size_per_gpu
- # micro_batch_size_per_gpu: 2 # set a number
- # max_length: null
- ulysses_sequence_parallel_size: 1 # sp size
- use_dynamic_bsz: ${critic.use_dynamic_bsz}
- forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
- reward_manager: tinyzero
-
custom_reward_function:
path: null
name: compute_score
algorithm:
- gamma: 1.0
- lam: 1.0
- adv_estimator: gae
norm_adv_by_std_in_grpo: True
use_kl_in_reward: False
kl_penalty: kl # how to estimate kl divergence
@@ -374,24 +305,15 @@ algorithm:
trainer:
balance_batch: True
- total_epochs: 15
# total_training_steps: null
- project_name: TinyZero
- experiment_name: trinity-qwen2.5-1.5b
logger: [ 'wandb' ]
- val_generations_to_log_to_wandb: 0
- nnodes: 1
- n_gpus_per_node: 2
- save_freq: 100
# auto: find the last ckpt to resume. If can't find, start from scratch
resume_mode: auto # or auto or resume_path if
resume_from_path: ""
- test_freq: 100
critic_warmup: 0
default_hdfs_dir: null
remove_previous_ckpt_in_save: False
del_local_ckpt_after_load: False
- default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
val_before_train: False
max_actor_ckpt_to_keep: 5
max_critic_ckpt_to_keep: 5
@@ -402,11 +324,6 @@ trainer:
- `actor_rollout_ref.model.use_remove_padding`: Whether to remove pad tokens, which will reduce training time.
- `actor_rollout_ref.actor.use_dynamic_bsz`: Whether to reorganize the batch data, specifically to splice the shorter data to reduce the batch size in the actual training process.
- `actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu`: Batch size for one GPU in one forward pass.
-- `actor_rollout_ref.actor.grad_clip`: Gradient clip for actor model training.
-- `actor_rollout_ref.actor.clip_ratio`: Used for compute policy loss.
-- `actor_rollout_ref.actor.entropy_coeff`: Used for compute policy loss.
-- `actor_rollout_ref.actor.use_kl_loss`: Whether to enable kl loss.
-- `actor_rollout_ref.actor.kl_loss_coef`: The coefficient of kl loss.
- `actor_rollout_ref.actor.kl_loss_type`: How to compute kl loss, optional value is `kl`, `abs`, `mse` or `low_var_kl`.
- `actor_rollout_ref.actor.ulysses_sequence_parallel_size`: Ulysses sequence parallel size.
- `actor_rollout_ref.actor.tau`: strength of regularization w.r.t. old / ref policy.
diff --git a/examples/async_gsm8k/verl_config.yaml b/examples/async_gsm8k/verl_config.yaml
index 268d61e0e5..18d3060847 100644
--- a/examples/async_gsm8k/verl_config.yaml
+++ b/examples/async_gsm8k/verl_config.yaml
@@ -1,23 +1,6 @@
-data:
- tokenizer: null
- train_files: placeholder
- val_files: placeholder
- prompt_key: prompt
- max_prompt_length: 256
- max_response_length: 1024
- train_batch_size: 256
- val_batch_size: null
- return_raw_input_ids: False # This should be set to true when the tokenizer between policy and rm differs
- return_raw_chat: False
- shuffle: True
- filter_overlong_prompts: False # for large-scale dataset, filtering overlong prompts could be timeconsuming. You should disable this and set `truncation='left'
- truncation: error
- image_key: images
-
actor_rollout_ref:
hybrid_engine: True
model:
- path: /PATH/TO/MODEL/
external_lib: null
override_config: { }
enable_gradient_checkpointing: True
@@ -25,15 +8,9 @@ actor_rollout_ref:
actor:
strategy: fsdp # This is for backward-compatibility
ppo_mini_batch_size: 128
- # ppo_micro_batch_size: 8 # will be deprecated, use ppo_micro_batch_size_per_gpu
ppo_micro_batch_size_per_gpu: 4
use_dynamic_bsz: True # False
ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
- grad_clip: 1.0
- clip_ratio: 0.2
- entropy_coeff: 0.001
- use_kl_loss: True # True for GRPO
- kl_loss_coef: 0.001 # for grpo
kl_loss_type: low_var_kl # for grpo
ppo_epochs: 1
shuffle: False
@@ -61,101 +38,16 @@ actor_rollout_ref:
wrap_policy:
# transformer_layer_cls_to_wrap: None
min_num_params: 0
- # log_prob_micro_batch_size: 4 # will be deprecated, use log_prob_micro_batch_size_per_gpu
log_prob_micro_batch_size_per_gpu: 16
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
- rollout:
- name: vllm
- temperature: 1.0
- use_fire_sampling: False # https://arxiv.org/abs/2410.21236
- prompt_length: ${data.max_prompt_length} # not use for opensource
- response_length: ${data.max_response_length}
- # for vllm rollout
- dtype: bfloat16 # should align with FSDP
- gpu_memory_utilization: 0.4
- ignore_eos: False
- enforce_eager: True
- free_cache_engine: True
- load_format: dummy_dtensor
- tensor_model_parallel_size: 2
- max_num_batched_tokens: 8192
- max_model_len: null
- max_num_seqs: 1024
- # log_prob_micro_batch_size: 8 # will be deprecated, use log_prob_micro_batch_size_per_gpu
- log_prob_micro_batch_size_per_gpu: 4
- log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
- log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
- disable_log_stats: True
- enable_chunked_prefill: True # could get higher throughput
- # for hf rollout
- do_sample: True
- # number of responses (i.e. num sample times)
- n: 8 # > 1 for grpo
-
-critic:
- strategy: fsdp
- optim:
- lr: 1e-5
- lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
- # min_lr_ratio: null # only useful for warmup with cosine
- warmup_style: constant # select from constant/cosine
- total_training_steps: -1 # must be override by program
- model:
- path: /PATH/TO/MODEL/
- tokenizer_path: ${actor_rollout_ref.model.path}
- override_config: { }
- external_lib: ${actor_rollout_ref.model.external_lib}
- enable_gradient_checkpointing: True
- use_remove_padding: False
- fsdp_config:
- param_offload: False
- optimizer_offload: False
- wrap_policy:
- # transformer_layer_cls_to_wrap: None
- min_num_params: 0
- fsdp_size: -1
- ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
- # ppo_micro_batch_size: 8 # will be deprecated, use ppo_micro_batch_size_per_gpu
- ppo_micro_batch_size_per_gpu: 64
- forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
- use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
- ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2
- forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
- ulysses_sequence_parallel_size: 1 # sp size
- ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
- shuffle: ${actor_rollout_ref.actor.shuffle}
- grad_clip: 1.0
- cliprange_value: 0.5
-
-reward_model:
- enable: False
- strategy: fsdp
- model:
- input_tokenizer: ${actor_rollout_ref.model.path} # set this to null if the chat template is identical
- path: ~/models/FsfairX-LLaMA3-RM-v0.1
- external_lib: ${actor_rollout_ref.model.external_lib}
- use_remove_padding: False
- fsdp_config:
- min_num_params: 0
- param_offload: False
- fsdp_size: -1
- # micro_batch_size: null # will be deprecated, use micro_batch_size_per_gpu
- # micro_batch_size_per_gpu: 2 # set a number
- # max_length: null
- ulysses_sequence_parallel_size: 1 # sp size
- use_dynamic_bsz: ${critic.use_dynamic_bsz}
- forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
custom_reward_function:
path: null
name: compute_score
algorithm:
- gamma: 1.0
- lam: 1.0
- adv_estimator: grpo
kl_penalty: kl # how to estimate kl divergence
kl_ctrl:
type: fixed
@@ -163,21 +55,11 @@ algorithm:
trainer:
balance_batch: True
- total_epochs: 10
# total_training_steps: null
- project_name: rft_example_gsm8k
- experiment_name: cys-qwen2_1.5b_rollout8_grpo_kl0.001_lr1e-5
logger: [ 'console','wandb' ]
- val_generations_to_log_to_wandb: 0
- nnodes: 1
- n_gpus_per_node: 2
- save_freq: 100
# auto: find the last ckpt to resume. If can't find, start from scratch
resume_mode: auto # or auto or resume_path if
- test_freq: 5
- critic_warmup: 0
default_hdfs_dir: null
remove_previous_ckpt_in_save: False
del_local_ckpt_after_load: False
- default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
val_before_train: False
diff --git a/examples/dpo_humanlike/train_dpo.yaml b/examples/dpo_humanlike/train_dpo.yaml
index 09327877f9..ccad2baa37 100644
--- a/examples/dpo_humanlike/train_dpo.yaml
+++ b/examples/dpo_humanlike/train_dpo.yaml
@@ -1,23 +1,6 @@
-data:
- tokenizer: null
- train_files: /train.parquet # useless
- val_files: /test.parquet # useless
- prompt_key: prompt
- max_prompt_length: 1792
- max_response_length: 256
- train_batch_size: 32
- val_batch_size: null
- return_raw_input_ids: False # This should be set to true when the tokenizer between policy and rm differs
- return_raw_chat: False
- shuffle: True
- filter_overlong_prompts: False # for large-scale dataset, filtering overlong prompts could be timeconsuming. You should disable this and set `truncation='left'
- truncation: error
- image_key: images
-
actor_rollout_ref:
hybrid_engine: True
model:
- path: /PATH/TO/MODEL/CHECKPOINT/
external_lib: null
override_config: { }
enable_gradient_checkpointing: True
@@ -25,15 +8,9 @@ actor_rollout_ref:
actor:
strategy: fsdp # This is for backward-compatibility
ppo_mini_batch_size: 32
- # ppo_micro_batch_size: 8 # will be deprecated, use ppo_micro_batch_size_per_gpu
ppo_micro_batch_size_per_gpu: 2 # NOTE
use_dynamic_bsz: False
ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
- grad_clip: 1.0
- clip_ratio: 0.2
- entropy_coeff: 0.001
- use_kl_loss: True
- kl_loss_coef: 0.1 # NOTE: beta for DPO
kl_loss_type: low_var_kl # for grpo
ppo_epochs: 1
shuffle: False
@@ -64,96 +41,12 @@ actor_rollout_ref:
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
- rollout:
- name: vllm
- temperature: 1.0
- top_k: -1 # 0 for hf rollout, -1 for vllm rollout
- top_p: 1
- use_fire_sampling: False # https://arxiv.org/abs/2410.21236
- prompt_length: ${data.max_prompt_length} # not use for opensource
- response_length: ${data.max_response_length}
- # for vllm rollout
- dtype: bfloat16 # should align with FSDP
- gpu_memory_utilization: 0.4
- ignore_eos: False
- enforce_eager: True
- free_cache_engine: True
- load_format: dummy_dtensor
- tensor_model_parallel_size: 2
- max_num_batched_tokens: 8192
- max_model_len: null
- max_num_seqs: 1024
- # log_prob_micro_batch_size: 8 # will be deprecated, use log_prob_micro_batch_size_per_gpu
- log_prob_micro_batch_size_per_gpu: 4
- log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
- log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
- disable_log_stats: True
- enable_chunked_prefill: True # could get higher throughput
- # for hf rollout
- do_sample: True
-
-critic:
- strategy: fsdp
- optim:
- lr: 1e-5
- lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
- # min_lr_ratio: null # only useful for warmup with cosine
- warmup_style: constant # select from constant/cosine
- total_training_steps: 783 # must be override by program
- model:
- path: /PATH/TO/MODEL/CHECKPOINT/
- tokenizer_path: ${actor_rollout_ref.model.path}
- override_config: { }
- external_lib: ${actor_rollout_ref.model.external_lib}
- enable_gradient_checkpointing: True
- use_remove_padding: False
- fsdp_config:
- param_offload: False
- optimizer_offload: False
- wrap_policy:
- # transformer_layer_cls_to_wrap: None
- min_num_params: 0
- fsdp_size: -1
- ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
- # ppo_micro_batch_size: 8 # will be deprecated, use ppo_micro_batch_size_per_gpu
- ppo_micro_batch_size_per_gpu: 1
- forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
- use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
- ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2
- forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
- ulysses_sequence_parallel_size: 1 # sp size
- ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
- shuffle: ${actor_rollout_ref.actor.shuffle}
- grad_clip: 1.0
- cliprange_value: 0.5
-
-reward_model:
- enable: False
- strategy: fsdp
- model:
- input_tokenizer: ${actor_rollout_ref.model.path} # set this to null if the chat template is identical
- path: ~/models/FsfairX-LLaMA3-RM-v0.1
- external_lib: ${actor_rollout_ref.model.external_lib}
- use_remove_padding: False
- fsdp_config:
- min_num_params: 0
- param_offload: False
- fsdp_size: -1
- # micro_batch_size: null # will be deprecated, use micro_batch_size_per_gpu
- # micro_batch_size_per_gpu: 2 # set a number
- # max_length: null
- ulysses_sequence_parallel_size: 1 # sp size
- use_dynamic_bsz: ${critic.use_dynamic_bsz}
- forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
custom_reward_function:
path: null
name: compute_score
algorithm:
- gamma: 1.0
- lam: 1.0
- adv_estimator: grpo
kl_penalty: kl
kl_ctrl:
type: fixed
@@ -161,20 +54,11 @@ algorithm:
trainer:
balance_batch: False
- total_epochs: 1 #
total_training_steps: 783 #
- project_name: dpo_example
- experiment_name: trinity_dpo
logger: [ 'console','wandb' ]
- val_generations_to_log_to_wandb: 0
- nnodes: 1
- n_gpus_per_node: 2
# auto: find the last ckpt to resume. If can't find, start from scratch
resume_mode: auto # or auto or resume_path if
- test_freq: 5
- critic_warmup: 0
default_hdfs_dir: null
remove_previous_ckpt_in_save: False
del_local_ckpt_after_load: False
- default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
val_before_train: False
diff --git a/examples/grpo_alfworld/train_alfworld.yaml b/examples/grpo_alfworld/train_alfworld.yaml
index a210c39916..169b0fe020 100644
--- a/examples/grpo_alfworld/train_alfworld.yaml
+++ b/examples/grpo_alfworld/train_alfworld.yaml
@@ -1,23 +1,6 @@
-data:
- tokenizer: null
- train_files: train_example.parquet
- val_files: test_example.parquet
- prompt_key: prompt
- max_prompt_length: 4096
- max_response_length: 16384
- train_batch_size: 96
- val_batch_size: null
- return_raw_input_ids: False # This should be set to true when the tokenizer between policy and rm differs
- return_raw_chat: False
- shuffle: True
- filter_overlong_prompts: False # for large-scale dataset, filtering overlong prompts could be timeconsuming. You should disable this and set `truncation='left'
- truncation: error
- image_key: images
-
actor_rollout_ref:
hybrid_engine: True
model:
- path: /PATH/TO/MODEL/CHECKPOINT/
external_lib: null
override_config: { }
enable_gradient_checkpointing: True
@@ -25,15 +8,9 @@ actor_rollout_ref:
actor:
strategy: fsdp # This is for backward-compatibility
ppo_mini_batch_size: 1536
- # ppo_micro_batch_size: 8 # will be deprecated, use ppo_micro_batch_size_per_gpu
ppo_micro_batch_size_per_gpu: 1
use_dynamic_bsz: False
ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
- grad_clip: 1.0
- clip_ratio: 0.2
- entropy_coeff: 0.001
- use_kl_loss: True # True for GRPO
- kl_loss_coef: 0.001 # for grpo
kl_loss_type: low_var_kl # for grpo
ppo_epochs: 1
shuffle: False
@@ -57,103 +34,16 @@ actor_rollout_ref:
wrap_policy:
# transformer_layer_cls_to_wrap: None
min_num_params: 0
- # log_prob_micro_batch_size: 4 # will be deprecated, use log_prob_micro_batch_size_per_gpu
log_prob_micro_batch_size_per_gpu: 1
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
- rollout:
- name: vllm
- temperature: 1.0
- top_k: -1 # 0 for hf rollout, -1 for vllm rollout
- top_p: 1
- use_fire_sampling: False # https://arxiv.org/abs/2410.21236
- prompt_length: ${data.max_prompt_length} # not use for opensource
- response_length: ${data.max_response_length}
- # for vllm rollout
- dtype: bfloat16 # should align with FSDP
- gpu_memory_utilization: 0.4
- ignore_eos: False
- enforce_eager: True
- free_cache_engine: True
- load_format: dummy_dtensor
- tensor_model_parallel_size: 1
- max_num_batched_tokens: 8192
- max_model_len: null
- max_num_seqs: 1024
- # log_prob_micro_batch_size: 8 # will be deprecated, use log_prob_micro_batch_size_per_gpu
- log_prob_micro_batch_size_per_gpu: 1
- log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
- log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
- disable_log_stats: True
- enable_chunked_prefill: True # could get higher throughput
- # for hf rollout
- do_sample: True
- # number of responses (i.e. num sample times)
- n: 8 # should be > 1 for grpo; Currently is unused parameter
-
-critic:
- strategy: fsdp
- optim:
- lr: 1e-5
- lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
- # min_lr_ratio: null # only useful for warmup with cosine
- warmup_style: constant # select from constant/cosine
- total_training_steps: -1 # must be override by program
- model:
- path: /PATH/TO/MODEL/CHECKPOINT/
- tokenizer_path: ${actor_rollout_ref.model.path}
- override_config: { }
- external_lib: ${actor_rollout_ref.model.external_lib}
- enable_gradient_checkpointing: True
- use_remove_padding: False
- fsdp_config:
- param_offload: False
- optimizer_offload: False
- wrap_policy:
- # transformer_layer_cls_to_wrap: None
- min_num_params: 0
- fsdp_size: -1
- ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
- # ppo_micro_batch_size: 8 # will be deprecated, use ppo_micro_batch_size_per_gpu
- ppo_micro_batch_size_per_gpu: 1
- forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
- use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
- ppo_max_token_len_per_gpu: 16384 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2
- forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
- ulysses_sequence_parallel_size: 1 # sp size
- ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
- shuffle: ${actor_rollout_ref.actor.shuffle}
- grad_clip: 1.0
- cliprange_value: 0.5
-
-reward_model:
- enable: False
- strategy: fsdp
- model:
- input_tokenizer: ${actor_rollout_ref.model.path} # set this to null if the chat template is identical
- path: ~/models/FsfairX-LLaMA3-RM-v0.1
- external_lib: ${actor_rollout_ref.model.external_lib}
- use_remove_padding: False
- fsdp_config:
- min_num_params: 0
- param_offload: False
- fsdp_size: -1
- # micro_batch_size: null # will be deprecated, use micro_batch_size_per_gpu
- # micro_batch_size_per_gpu: 2 # set a number
- # max_length: null
- ulysses_sequence_parallel_size: 1 # sp size
- use_dynamic_bsz: ${critic.use_dynamic_bsz}
- forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
custom_reward_function:
path: null
name: compute_score
algorithm:
- gamma: 1.0
- lam: 1.0
- adv_estimator: grpo
kl_penalty: kl # how to estimate kl divergence
kl_ctrl:
type: fixed
@@ -161,20 +51,11 @@ algorithm:
trainer:
balance_batch: True
- total_epochs: 15
# total_training_steps: null
- project_name: ALFWORLD
- experiment_name: ALFWORLD_RFT
logger: [ 'wandb' ]
- val_generations_to_log_to_wandb: 0
- nnodes: 1
- n_gpus_per_node: 2
# auto: find the last ckpt to resume. If can't find, start from scratch
resume_mode: auto # or auto or resume_path if
- test_freq: 100
- critic_warmup: 0
default_hdfs_dir: null
remove_previous_ckpt_in_save: False
del_local_ckpt_after_load: False
- default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
val_before_train: False
diff --git a/examples/grpo_gsm8k/train_gsm8k.yaml b/examples/grpo_gsm8k/train_gsm8k.yaml
index 13b195f557..18d3060847 100644
--- a/examples/grpo_gsm8k/train_gsm8k.yaml
+++ b/examples/grpo_gsm8k/train_gsm8k.yaml
@@ -1,23 +1,6 @@
-data:
- tokenizer: null
- train_files: train_example.parquet
- val_files: test_example.parquet
- prompt_key: prompt
- max_prompt_length: 256
- max_response_length: 1024
- train_batch_size: 256
- val_batch_size: null
- return_raw_input_ids: False # This should be set to true when the tokenizer between policy and rm differs
- return_raw_chat: False
- shuffle: True
- filter_overlong_prompts: False # for large-scale dataset, filtering overlong prompts could be timeconsuming. You should disable this and set `truncation='left'
- truncation: error
- image_key: images
-
actor_rollout_ref:
hybrid_engine: True
model:
- path: /PATH/TO/MODEL/CHECKPOINT/
external_lib: null
override_config: { }
enable_gradient_checkpointing: True
@@ -25,15 +8,9 @@ actor_rollout_ref:
actor:
strategy: fsdp # This is for backward-compatibility
ppo_mini_batch_size: 128
- # ppo_micro_batch_size: 8 # will be deprecated, use ppo_micro_batch_size_per_gpu
ppo_micro_batch_size_per_gpu: 4
use_dynamic_bsz: True # False
ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
- grad_clip: 1.0
- clip_ratio: 0.2
- entropy_coeff: 0.001
- use_kl_loss: True # True for GRPO
- kl_loss_coef: 0.001 # for grpo
kl_loss_type: low_var_kl # for grpo
ppo_epochs: 1
shuffle: False
@@ -61,103 +38,16 @@ actor_rollout_ref:
wrap_policy:
# transformer_layer_cls_to_wrap: None
min_num_params: 0
- # log_prob_micro_batch_size: 4 # will be deprecated, use log_prob_micro_batch_size_per_gpu
log_prob_micro_batch_size_per_gpu: 16
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
- rollout:
- name: vllm
- temperature: 1.0
- top_k: -1 # 0 for hf rollout, -1 for vllm rollout
- top_p: 1
- use_fire_sampling: False # https://arxiv.org/abs/2410.21236
- prompt_length: ${data.max_prompt_length} # not use for opensource
- response_length: ${data.max_response_length}
- # for vllm rollout
- dtype: bfloat16 # should align with FSDP
- gpu_memory_utilization: 0.4
- ignore_eos: False
- enforce_eager: True
- free_cache_engine: True
- load_format: dummy_dtensor
- tensor_model_parallel_size: 2
- max_num_batched_tokens: 8192
- max_model_len: null
- max_num_seqs: 1024
- # log_prob_micro_batch_size: 8 # will be deprecated, use log_prob_micro_batch_size_per_gpu
- log_prob_micro_batch_size_per_gpu: 4
- log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
- log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
- disable_log_stats: True
- enable_chunked_prefill: True # could get higher throughput
- # for hf rollout
- do_sample: True
- # number of responses (i.e. num sample times)
- n: 8 # > 1 for grpo
-
-critic:
- strategy: fsdp
- optim:
- lr: 1e-5
- lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
- # min_lr_ratio: null # only useful for warmup with cosine
- warmup_style: constant # select from constant/cosine
- total_training_steps: -1 # must be override by program
- model:
- path: /PATH/TO/MODEL/CHECKPOINT/
- tokenizer_path: ${actor_rollout_ref.model.path}
- override_config: { }
- external_lib: ${actor_rollout_ref.model.external_lib}
- enable_gradient_checkpointing: True
- use_remove_padding: False
- fsdp_config:
- param_offload: False
- optimizer_offload: False
- wrap_policy:
- # transformer_layer_cls_to_wrap: None
- min_num_params: 0
- fsdp_size: -1
- ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
- # ppo_micro_batch_size: 8 # will be deprecated, use ppo_micro_batch_size_per_gpu
- ppo_micro_batch_size_per_gpu: 64
- forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
- use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
- ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2
- forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
- ulysses_sequence_parallel_size: 1 # sp size
- ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
- shuffle: ${actor_rollout_ref.actor.shuffle}
- grad_clip: 1.0
- cliprange_value: 0.5
-
-reward_model:
- enable: False
- strategy: fsdp
- model:
- input_tokenizer: ${actor_rollout_ref.model.path} # set this to null if the chat template is identical
- path: ~/models/FsfairX-LLaMA3-RM-v0.1
- external_lib: ${actor_rollout_ref.model.external_lib}
- use_remove_padding: False
- fsdp_config:
- min_num_params: 0
- param_offload: False
- fsdp_size: -1
- # micro_batch_size: null # will be deprecated, use micro_batch_size_per_gpu
- # micro_batch_size_per_gpu: 2 # set a number
- # max_length: null
- ulysses_sequence_parallel_size: 1 # sp size
- use_dynamic_bsz: ${critic.use_dynamic_bsz}
- forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
custom_reward_function:
path: null
name: compute_score
algorithm:
- gamma: 1.0
- lam: 1.0
- adv_estimator: grpo
kl_penalty: kl # how to estimate kl divergence
kl_ctrl:
type: fixed
@@ -165,20 +55,11 @@ algorithm:
trainer:
balance_batch: True
- total_epochs: 10
# total_training_steps: null
- project_name: rft_example_gsm8k
- experiment_name: cys-qwen2_1.5b_rollout8_grpo_kl0.001_lr1e-5
logger: [ 'console','wandb' ]
- val_generations_to_log_to_wandb: 0
- nnodes: 1
- n_gpus_per_node: 2
# auto: find the last ckpt to resume. If can't find, start from scratch
resume_mode: auto # or auto or resume_path if
- test_freq: 5
- critic_warmup: 0
default_hdfs_dir: null
remove_previous_ckpt_in_save: False
del_local_ckpt_after_load: False
- default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
val_before_train: False
diff --git a/examples/grpo_math/train_math.yaml b/examples/grpo_math/train_math.yaml
index 2482ccc785..b783445bf7 100644
--- a/examples/grpo_math/train_math.yaml
+++ b/examples/grpo_math/train_math.yaml
@@ -1,23 +1,6 @@
-data:
- tokenizer: null
- train_files: train_example.parquet
- val_files: test_example.parquet
- prompt_key: prompt
- max_prompt_length: 1024
- max_response_length: 2048
- # train_batch_size: 256
- val_batch_size: null
- return_raw_input_ids: False # This should be set to true when the tokenizer between policy and rm differs
- return_raw_chat: False
- shuffle: True
- filter_overlong_prompts: False # for large-scale dataset, filtering overlong prompts could be timeconsuming. You should disable this and set `truncation='left'
- truncation: error
- image_key: images
-
actor_rollout_ref:
hybrid_engine: True
model:
- path: /PATH/TO/MODEL/
external_lib: null
override_config: { }
enable_gradient_checkpointing: True
@@ -28,11 +11,6 @@ actor_rollout_ref:
ppo_micro_batch_size_per_gpu: 4
use_dynamic_bsz: True # False
ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
- grad_clip: 1.0
- clip_ratio: 0.2
- entropy_coeff: 0.001
- use_kl_loss: True # True for GRPO
- kl_loss_coef: 0.0001 # for grpo
kl_loss_type: low_var_kl # for grpo
ppo_epochs: 1
shuffle: False
@@ -64,93 +42,12 @@ actor_rollout_ref:
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
- rollout:
- name: vllm
- temperature: 1.0
- top_k: -1 # 0 for hf rollout, -1 for vllm rollout
- top_p: 1
- use_fire_sampling: False # https://arxiv.org/abs/2410.21236
- prompt_length: ${data.max_prompt_length} # not use for opensource
- response_length: ${data.max_response_length}
- # for vllm rollout
- dtype: bfloat16 # should align with FSDP
- gpu_memory_utilization: 0.4
- ignore_eos: False
- enforce_eager: True
- free_cache_engine: True
- load_format: dummy_dtensor
- tensor_model_parallel_size: 2
- max_num_batched_tokens: 8192
- max_model_len: null
- max_num_seqs: 1024
- log_prob_micro_batch_size_per_gpu: 4
- log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
- log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
- disable_log_stats: True
- enable_chunked_prefill: True # could get higher throughput
- # for hf rollout
- do_sample: True
- # number of responses (i.e. num sample times)
- n: 8 # > 1 for grpo
-
-critic:
- strategy: fsdp
- optim:
- lr: 1e-5
- lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
- # min_lr_ratio: null # only useful for warmup with cosine
- warmup_style: constant # select from constant/cosine
- total_training_steps: -1 # must be override by program
- model:
- path: /PATH/TO/MODEL/
- tokenizer_path: ${actor_rollout_ref.model.path}
- override_config: { }
- external_lib: ${actor_rollout_ref.model.external_lib}
- enable_gradient_checkpointing: True
- use_remove_padding: False
- fsdp_config:
- param_offload: False
- optimizer_offload: False
- wrap_policy:
- # transformer_layer_cls_to_wrap: None
- min_num_params: 0
- fsdp_size: -1
- ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
- ppo_micro_batch_size_per_gpu: 64
- forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
- use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
- ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2
- forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
- ulysses_sequence_parallel_size: 1 # sp size
- ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
- shuffle: ${actor_rollout_ref.actor.shuffle}
- grad_clip: 1.0
- cliprange_value: 0.5
-
-reward_model:
- enable: False
- strategy: fsdp
- model:
- input_tokenizer: ${actor_rollout_ref.model.path} # set this to null if the chat template is identical
- path: ~/models/FsfairX-LLaMA3-RM-v0.1
- external_lib: ${actor_rollout_ref.model.external_lib}
- use_remove_padding: False
- fsdp_config:
- min_num_params: 0
- param_offload: False
- fsdp_size: -1
- ulysses_sequence_parallel_size: 1 # sp size
- use_dynamic_bsz: ${critic.use_dynamic_bsz}
- forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
custom_reward_function:
path: null
name: compute_score
algorithm:
- gamma: 1.0
- lam: 1.0
- adv_estimator: grpo
kl_penalty: kl # how to estimate kl divergence
kl_ctrl:
type: fixed
@@ -158,19 +55,10 @@ algorithm:
trainer:
balance_batch: True
- total_epochs: 20
- project_name: grpo_math
- experiment_name: grpo_math_example
logger: [ 'console','wandb' ]
- val_generations_to_log_to_wandb: 0
- nnodes: 1
- n_gpus_per_node: 2
# auto: find the last ckpt to resume. If can't find, start from scratch
resume_mode: auto # or auto or resume_path if
- test_freq: 5
- critic_warmup: 0
default_hdfs_dir: null
remove_previous_ckpt_in_save: False
del_local_ckpt_after_load: False
- default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
val_before_train: False
diff --git a/examples/grpo_sciworld/train_sciworld.yaml b/examples/grpo_sciworld/train_sciworld.yaml
index 833441142c..169b0fe020 100644
--- a/examples/grpo_sciworld/train_sciworld.yaml
+++ b/examples/grpo_sciworld/train_sciworld.yaml
@@ -1,23 +1,6 @@
-data:
- tokenizer: null
- train_files: train_example.parquet
- val_files: test_example.parquet
- prompt_key: prompt
- max_prompt_length: 4096
- max_response_length: 16384
- train_batch_size: 96
- val_batch_size: null
- return_raw_input_ids: False # This should be set to true when the tokenizer between policy and rm differs
- return_raw_chat: False
- shuffle: True
- filter_overlong_prompts: False # for large-scale dataset, filtering overlong prompts could be timeconsuming. You should disable this and set `truncation='left'
- truncation: error
- image_key: images
-
actor_rollout_ref:
hybrid_engine: True
model:
- path: /PATH/TO/MODEL/CHECKPOINT/
external_lib: null
override_config: { }
enable_gradient_checkpointing: True
@@ -28,11 +11,6 @@ actor_rollout_ref:
ppo_micro_batch_size_per_gpu: 1
use_dynamic_bsz: False
ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
- grad_clip: 1.0
- clip_ratio: 0.2
- entropy_coeff: 0.001
- use_kl_loss: True # True for GRPO
- kl_loss_coef: 0.001 # for grpo
kl_loss_type: low_var_kl # for grpo
ppo_epochs: 1
shuffle: False
@@ -60,95 +38,12 @@ actor_rollout_ref:
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
- rollout:
- name: vllm
- temperature: 1.0
- top_k: -1 # 0 for hf rollout, -1 for vllm rollout
- top_p: 1
- use_fire_sampling: False # https://arxiv.org/abs/2410.21236
- prompt_length: ${data.max_prompt_length} # not use for opensource
- response_length: ${data.max_response_length}
- # for vllm rollout
- dtype: bfloat16 # should align with FSDP
- gpu_memory_utilization: 0.4
- ignore_eos: False
- enforce_eager: True
- free_cache_engine: True
- load_format: dummy_dtensor
- tensor_model_parallel_size: 1
- max_num_batched_tokens: 8192
- max_model_len: null
- max_num_seqs: 1024
- log_prob_micro_batch_size_per_gpu: 1
- log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
- log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
- disable_log_stats: True
- enable_chunked_prefill: True # could get higher throughput
- # for hf rollout
- do_sample: True
- # number of responses (i.e. num sample times)
- n: 8 # should be > 1 for grpo; Currently is unused parameter
-
-critic:
- strategy: fsdp
- optim:
- lr: 1e-5
- lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
- # min_lr_ratio: null # only useful for warmup with cosine
- warmup_style: constant # select from constant/cosine
- total_training_steps: -1 # must be override by program
- model:
- path: /PATH/TO/MODEL/CHECKPOINT/
- tokenizer_path: ${actor_rollout_ref.model.path}
- override_config: { }
- external_lib: ${actor_rollout_ref.model.external_lib}
- enable_gradient_checkpointing: True
- use_remove_padding: False
- fsdp_config:
- param_offload: False
- optimizer_offload: False
- wrap_policy:
- # transformer_layer_cls_to_wrap: None
- min_num_params: 0
- fsdp_size: -1
- ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
- ppo_micro_batch_size_per_gpu: 1
- forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
- use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
- ppo_max_token_len_per_gpu: 16384 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2
- forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
- ulysses_sequence_parallel_size: 1 # sp size
- ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
- shuffle: ${actor_rollout_ref.actor.shuffle}
- grad_clip: 1.0
- cliprange_value: 0.5
-
-reward_model:
- enable: False
- strategy: fsdp
- model:
- input_tokenizer: ${actor_rollout_ref.model.path} # set this to null if the chat template is identical
- path: ~/models/FsfairX-LLaMA3-RM-v0.1
- external_lib: ${actor_rollout_ref.model.external_lib}
- use_remove_padding: False
- fsdp_config:
- min_num_params: 0
- param_offload: False
- fsdp_size: -1
- # micro_batch_size_per_gpu: 2 # set a number
- # max_length: null
- ulysses_sequence_parallel_size: 1 # sp size
- use_dynamic_bsz: ${critic.use_dynamic_bsz}
- forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
custom_reward_function:
path: null
name: compute_score
algorithm:
- gamma: 1.0
- lam: 1.0
- adv_estimator: grpo
kl_penalty: kl # how to estimate kl divergence
kl_ctrl:
type: fixed
@@ -156,20 +51,11 @@ algorithm:
trainer:
balance_batch: True
- total_epochs: 15
# total_training_steps: null
- project_name: sciworld
- experiment_name: sciworld_RFT
logger: [ 'wandb' ]
- val_generations_to_log_to_wandb: 0
- nnodes: 1
- n_gpus_per_node: 2
# auto: find the last ckpt to resume. If can't find, start from scratch
resume_mode: auto # or auto or resume_path if
- test_freq: 100
- critic_warmup: 0
default_hdfs_dir: null
remove_previous_ckpt_in_save: False
del_local_ckpt_after_load: False
- default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
val_before_train: False
diff --git a/examples/grpo_webshop/train_webshop.yaml b/examples/grpo_webshop/train_webshop.yaml
index ac502fec3f..169b0fe020 100644
--- a/examples/grpo_webshop/train_webshop.yaml
+++ b/examples/grpo_webshop/train_webshop.yaml
@@ -1,23 +1,6 @@
-data:
- tokenizer: null
- train_files: train_example.parquet
- val_files: test_example.parquet
- prompt_key: prompt
- max_prompt_length: 4096
- max_response_length: 16384
- train_batch_size: 96
- val_batch_size: null
- return_raw_input_ids: False # This should be set to true when the tokenizer between policy and rm differs
- return_raw_chat: False
- shuffle: True
- filter_overlong_prompts: False # for large-scale dataset, filtering overlong prompts could be timeconsuming. You should disable this and set `truncation='left'
- truncation: error
- image_key: images
-
actor_rollout_ref:
hybrid_engine: True
model:
- path: /PATH/TO/MODEL/CHECKPOINT/
external_lib: null
override_config: { }
enable_gradient_checkpointing: True
@@ -25,15 +8,9 @@ actor_rollout_ref:
actor:
strategy: fsdp # This is for backward-compatibility
ppo_mini_batch_size: 1536
- # ppo_micro_batch_size: 8 # will be deprecated, use ppo_micro_batch_size_per_gpu
ppo_micro_batch_size_per_gpu: 1
use_dynamic_bsz: False
ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
- grad_clip: 1.0
- clip_ratio: 0.2
- entropy_coeff: 0.001
- use_kl_loss: True # True for GRPO
- kl_loss_coef: 0.001 # for grpo
kl_loss_type: low_var_kl # for grpo
ppo_epochs: 1
shuffle: False
@@ -57,103 +34,16 @@ actor_rollout_ref:
wrap_policy:
# transformer_layer_cls_to_wrap: None
min_num_params: 0
- # log_prob_micro_batch_size: 4 # will be deprecated, use log_prob_micro_batch_size_per_gpu
log_prob_micro_batch_size_per_gpu: 1
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
- rollout:
- name: vllm
- temperature: 1.0
- top_k: -1 # 0 for hf rollout, -1 for vllm rollout
- top_p: 1
- use_fire_sampling: False # https://arxiv.org/abs/2410.21236
- prompt_length: ${data.max_prompt_length} # not use for opensource
- response_length: ${data.max_response_length}
- # for vllm rollout
- dtype: bfloat16 # should align with FSDP
- gpu_memory_utilization: 0.4
- ignore_eos: False
- enforce_eager: True
- free_cache_engine: True
- load_format: dummy_dtensor
- tensor_model_parallel_size: 1
- max_num_batched_tokens: 8192
- max_model_len: null
- max_num_seqs: 1024
- # log_prob_micro_batch_size: 8 # will be deprecated, use log_prob_micro_batch_size_per_gpu
- log_prob_micro_batch_size_per_gpu: 1
- log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
- log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
- disable_log_stats: True
- enable_chunked_prefill: True # could get higher throughput
- # for hf rollout
- do_sample: True
- # number of responses (i.e. num sample times)
- n: 8 # should be > 1 for grpo; Currently is unused parameter
-
-critic:
- strategy: fsdp
- optim:
- lr: 1e-5
- lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
- # min_lr_ratio: null # only useful for warmup with cosine
- warmup_style: constant # select from constant/cosine
- total_training_steps: -1 # must be override by program
- model:
- path: /PATH/TO/MODEL/CHECKPOINT/
- tokenizer_path: ${actor_rollout_ref.model.path}
- override_config: { }
- external_lib: ${actor_rollout_ref.model.external_lib}
- enable_gradient_checkpointing: True
- use_remove_padding: False
- fsdp_config:
- param_offload: False
- optimizer_offload: False
- wrap_policy:
- # transformer_layer_cls_to_wrap: None
- min_num_params: 0
- fsdp_size: -1
- ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
- # ppo_micro_batch_size: 8 # will be deprecated, use ppo_micro_batch_size_per_gpu
- ppo_micro_batch_size_per_gpu: 1
- forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
- use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
- ppo_max_token_len_per_gpu: 16384 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2
- forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
- ulysses_sequence_parallel_size: 1 # sp size
- ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
- shuffle: ${actor_rollout_ref.actor.shuffle}
- grad_clip: 1.0
- cliprange_value: 0.5
-
-reward_model:
- enable: False
- strategy: fsdp
- model:
- input_tokenizer: ${actor_rollout_ref.model.path} # set this to null if the chat template is identical
- path: ~/models/FsfairX-LLaMA3-RM-v0.1
- external_lib: ${actor_rollout_ref.model.external_lib}
- use_remove_padding: False
- fsdp_config:
- min_num_params: 0
- param_offload: False
- fsdp_size: -1
- # micro_batch_size: null # will be deprecated, use micro_batch_size_per_gpu
- # micro_batch_size_per_gpu: 2 # set a number
- # max_length: null
- ulysses_sequence_parallel_size: 1 # sp size
- use_dynamic_bsz: ${critic.use_dynamic_bsz}
- forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
custom_reward_function:
path: null
name: compute_score
algorithm:
- gamma: 1.0
- lam: 1.0
- adv_estimator: grpo
kl_penalty: kl # how to estimate kl divergence
kl_ctrl:
type: fixed
@@ -161,20 +51,11 @@ algorithm:
trainer:
balance_batch: True
- total_epochs: 15
# total_training_steps: null
- project_name: WEBSHOP
- experiment_name: WEBSHOP_RFT
logger: [ 'wandb' ]
- val_generations_to_log_to_wandb: 0
- nnodes: 1
- n_gpus_per_node: 2
# auto: find the last ckpt to resume. If can't find, start from scratch
resume_mode: auto # or auto or resume_path if
- test_freq: 100
- critic_warmup: 0
default_hdfs_dir: null
remove_previous_ckpt_in_save: False
del_local_ckpt_after_load: False
- default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
val_before_train: False
diff --git a/examples/opmd_gsm8k/train_opmd_gsm8k.yaml b/examples/opmd_gsm8k/train_opmd_gsm8k.yaml
index 88f92fb461..22c49da486 100644
--- a/examples/opmd_gsm8k/train_opmd_gsm8k.yaml
+++ b/examples/opmd_gsm8k/train_opmd_gsm8k.yaml
@@ -22,26 +22,9 @@
# adv_estimator: grpo # merely to disable critic model, doesn't affect adv compute when algorithm_type is opmd
-data:
- tokenizer: null
- train_files: /train.jsonl
- val_files: /test.jsonl
- prompt_key: prompt
- max_prompt_length: 256
- max_response_length: 1024
- train_batch_size: 256
- val_batch_size: null
- return_raw_input_ids: False # This should be set to true when the tokenizer between policy and rm differs
- return_raw_chat: False
- shuffle: True
- filter_overlong_prompts: False # for large-scale dataset, filtering overlong prompts could be timeconsuming. You should disable this and set `truncation='left'
- truncation: error
- image_key: images
-
actor_rollout_ref:
hybrid_engine: True
model:
- path: path_to_models/Qwen2.5-1.5B-Inst
external_lib: null
override_config: { }
enable_gradient_checkpointing: True
@@ -49,15 +32,9 @@ actor_rollout_ref:
actor:
strategy: fsdp # This is for backward-compatibility
ppo_mini_batch_size: 128
- # ppo_micro_batch_size: 8 # will be deprecated, use ppo_micro_batch_size_per_gpu
ppo_micro_batch_size_per_gpu: 4
use_dynamic_bsz: True
ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
- grad_clip: 1.0
- clip_ratio: 0.2
- entropy_coeff: 0.000
- use_kl_loss: True
- kl_loss_coef: 0.001
kl_loss_type: mse
ppo_epochs: 1
shuffle: False
@@ -87,103 +64,16 @@ actor_rollout_ref:
wrap_policy:
# transformer_layer_cls_to_wrap: None
min_num_params: 0
- # log_prob_micro_batch_size: 4 # will be deprecated, use log_prob_micro_batch_size_per_gpu
log_prob_micro_batch_size_per_gpu: 16
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
- rollout:
- name: vllm
- temperature: 1.0
- top_k: -1 # 0 for hf rollout, -1 for vllm rollout
- top_p: 1
- use_fire_sampling: False # https://arxiv.org/abs/2410.21236
- prompt_length: ${data.max_prompt_length} # not use for opensource
- response_length: ${data.max_response_length}
- # for vllm rollout
- dtype: bfloat16 # should align with FSDP
- gpu_memory_utilization: 0.4
- ignore_eos: False
- enforce_eager: True
- free_cache_engine: True
- load_format: dummy_dtensor
- tensor_model_parallel_size: 2
- max_num_batched_tokens: 8192
- max_model_len: null
- max_num_seqs: 1024
- # log_prob_micro_batch_size: 8 # will be deprecated, use log_prob_micro_batch_size_per_gpu
- log_prob_micro_batch_size_per_gpu: 4
- log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
- log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
- disable_log_stats: True
- enable_chunked_prefill: True # could get higher throughput
- # for hf rollout
- do_sample: True
- # number of responses (i.e. num sample times)
- n: 8 # > 1 for grpo
-
-critic:
- strategy: fsdp
- optim:
- lr: 1e-5
- lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
- # min_lr_ratio: null # only useful for warmup with cosine
- warmup_style: constant # select from constant/cosine
- total_training_steps: -1 # must be override by program
- model:
- path: path_to_models/Qwen2.5-1.5B-Inst
- tokenizer_path: ${actor_rollout_ref.model.path}
- override_config: { }
- external_lib: ${actor_rollout_ref.model.external_lib}
- enable_gradient_checkpointing: True
- use_remove_padding: False
- fsdp_config:
- param_offload: False
- optimizer_offload: False
- wrap_policy:
- # transformer_layer_cls_to_wrap: None
- min_num_params: 0
- fsdp_size: -1
- ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
- # ppo_micro_batch_size: 8 # will be deprecated, use ppo_micro_batch_size_per_gpu
- ppo_micro_batch_size_per_gpu: 64
- forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
- use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
- ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2
- forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
- ulysses_sequence_parallel_size: 1 # sp size
- ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
- shuffle: ${actor_rollout_ref.actor.shuffle}
- grad_clip: 1.0
- cliprange_value: 0.5
-
-reward_model:
- enable: False
- strategy: fsdp
- model:
- input_tokenizer: ${actor_rollout_ref.model.path} # set this to null if the chat template is identical
- path: ~/models/FsfairX-LLaMA3-RM-v0.1
- external_lib: ${actor_rollout_ref.model.external_lib}
- use_remove_padding: False
- fsdp_config:
- min_num_params: 0
- param_offload: False
- fsdp_size: -1
- # micro_batch_size: null # will be deprecated, use micro_batch_size_per_gpu
- # micro_batch_size_per_gpu: 2 # set a number
- # max_length: null
- ulysses_sequence_parallel_size: 1 # sp size
- use_dynamic_bsz: ${critic.use_dynamic_bsz}
- forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
custom_reward_function:
path: null
name: compute_score
algorithm:
- gamma: 1.0
- lam: 1.0
- adv_estimator: grpo
kl_penalty: kl # how to estimate kl divergence
kl_ctrl:
type: fixed
@@ -191,22 +81,13 @@ algorithm:
trainer:
balance_batch: True
- total_epochs: 10
# total_training_steps: null
- project_name: Trinity-RFT-gsm8k-test-opmd
- experiment_name: qwen2.5-1.5B-gsm8k-opmd-kl_0.001-entropy_0-tau_4-beta1_0.0-beta2_0.95-lr_2e-6-sync10
logger: [ 'console','wandb' ]
- val_generations_to_log_to_wandb: 0
- nnodes: 1
- n_gpus_per_node: 2
# auto: find the last ckpt to resume. If can't find, start from scratch
resume_mode: auto # or auto or resume_path if
- test_freq: 100
- critic_warmup: 0
default_hdfs_dir: null
remove_previous_ckpt_in_save: False
del_local_ckpt_after_load: False
- default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
val_before_train: False
max_actor_ckpt_to_keep: 5
max_critic_ckpt_to_keep: 5
diff --git a/examples/ppo_countdown/train_countdown.yaml b/examples/ppo_countdown/train_countdown.yaml
index 291afe452f..fbc0ecc1d0 100644
--- a/examples/ppo_countdown/train_countdown.yaml
+++ b/examples/ppo_countdown/train_countdown.yaml
@@ -1,23 +1,6 @@
-data:
- tokenizer: null
- train_files: train_example.parquet
- val_files: test_example.parquet
- prompt_key: prompt
- max_prompt_length: 256
- max_response_length: 1024
- train_batch_size: 256
- val_batch_size: null
- return_raw_input_ids: False # This should be set to true when the tokenizer between policy and rm differs
- return_raw_chat: False
- shuffle: True
- filter_overlong_prompts: False # for large-scale dataset, filtering overlong prompts could be timeconsuming. You should disable this and set `truncation='left'
- truncation: error
- image_key: images
-
actor_rollout_ref:
hybrid_engine: True
model:
- path: /PATH/TO/MODEL/CHECKPOINT/
external_lib: null
override_config: { }
enable_gradient_checkpointing: True
@@ -25,15 +8,9 @@ actor_rollout_ref:
actor:
strategy: fsdp # This is for backward-compatibility
ppo_mini_batch_size: 128
- # ppo_micro_batch_size: 8 # will be deprecated, use ppo_micro_batch_size_per_gpu
ppo_micro_batch_size_per_gpu: 4
use_dynamic_bsz: True
ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
- grad_clip: 1.0
- clip_ratio: 0.2
- entropy_coeff: 0.001
- use_kl_loss: False # True for GRPO
- kl_loss_coef: 0.001 # for grpo
kl_loss_type: low_var_kl # for grpo
ppo_epochs: 1
shuffle: False
@@ -63,40 +40,10 @@ actor_rollout_ref:
wrap_policy:
# transformer_layer_cls_to_wrap: None
min_num_params: 0
- # log_prob_micro_batch_size: 4 # will be deprecated, use log_prob_micro_batch_size_per_gpu
log_prob_micro_batch_size_per_gpu: 8
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
- rollout:
- name: vllm
- temperature: 1.0
- top_k: -1 # 0 for hf rollout, -1 for vllm rollout
- top_p: 1
- use_fire_sampling: False # https://arxiv.org/abs/2410.21236
- prompt_length: ${data.max_prompt_length} # not use for opensource
- response_length: ${data.max_response_length}
- # for vllm rollout
- dtype: bfloat16 # should align with FSDP
- gpu_memory_utilization: 0.4
- ignore_eos: False
- enforce_eager: True
- free_cache_engine: True
- load_format: dummy_dtensor
- tensor_model_parallel_size: 2
- max_num_batched_tokens: 8192
- max_model_len: null
- max_num_seqs: 1024
- # log_prob_micro_batch_size: 8 # will be deprecated, use log_prob_micro_batch_size_per_gpu
- log_prob_micro_batch_size_per_gpu: 4
- log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
- log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
- disable_log_stats: True
- enable_chunked_prefill: True # could get higher throughput
- # for hf rollout
- do_sample: True
- # number of responses (i.e. num sample times)
- n: 1 # > 1 for grpo
critic:
strategy: fsdp
@@ -107,8 +54,6 @@ critic:
warmup_style: constant # select from constant/cosine
total_training_steps: -1 # must be override by program
model:
- path: /PATH/TO/MODEL/CHECKPOINT/
- tokenizer_path: ${actor_rollout_ref.model.path}
override_config: { }
external_lib: ${actor_rollout_ref.model.external_lib}
enable_gradient_checkpointing: True
@@ -121,7 +66,6 @@ critic:
min_num_params: 0
fsdp_size: -1
ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
- # ppo_micro_batch_size: 8 # will be deprecated, use ppo_micro_batch_size_per_gpu
ppo_micro_batch_size_per_gpu: 8
forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
@@ -133,33 +77,11 @@ critic:
grad_clip: 1.0
cliprange_value: 0.5
-reward_model:
- enable: False
- strategy: fsdp
- model:
- input_tokenizer: ${actor_rollout_ref.model.path} # set this to null if the chat template is identical
- path: ~/models/FsfairX-LLaMA3-RM-v0.1
- external_lib: ${actor_rollout_ref.model.external_lib}
- use_remove_padding: False
- fsdp_config:
- min_num_params: 0
- param_offload: False
- fsdp_size: -1
- # micro_batch_size: null # will be deprecated, use micro_batch_size_per_gpu
- # micro_batch_size_per_gpu: 2 # set a number
- # max_length: null
- ulysses_sequence_parallel_size: 1 # sp size
- use_dynamic_bsz: ${critic.use_dynamic_bsz}
- forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
-
custom_reward_function:
path: null
name: compute_score
algorithm:
- gamma: 1.0
- lam: 1.0
- adv_estimator: gae
kl_penalty: kl # how to estimate kl divergence
kl_ctrl:
type: fixed
@@ -167,22 +89,14 @@ algorithm:
trainer:
balance_batch: True
- total_epochs: 15
# total_training_steps: null
- project_name: TinyZero
- experiment_name: trinity-qwen2.5-1.5b
logger: [ 'wandb' ]
- val_generations_to_log_to_wandb: 0
- nnodes: 1
- n_gpus_per_node: 2
# auto: find the last ckpt to resume. If can't find, start from scratch
resume_mode: auto # or auto or resume_path if
- test_freq: 100
critic_warmup: 0
default_hdfs_dir: null
remove_previous_ckpt_in_save: False
del_local_ckpt_after_load: False
- default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
val_before_train: False
max_actor_ckpt_to_keep: 5
max_critic_ckpt_to_keep: 5
diff --git a/tests/common/vllm_test.py b/tests/common/vllm_test.py
index 0146eb075c..e3dfaaa1df 100644
--- a/tests/common/vllm_test.py
+++ b/tests/common/vllm_test.py
@@ -162,50 +162,50 @@ def setUp(self):
self.model_wrapper = ModelWrapper(self.engines[0], model_type="vllm_async")
-class TestModelWrapperAsyncTPV0(BaseTestModelWrapper, RayUnittestBase):
- def setUp(self):
- self.config = get_template_config()
- self.config.mode = "explore"
- self.config.model.model_path = get_model_path()
- self.config.explorer.rollout_model.engine_type = "vllm_async"
- self.config.explorer.rollout_model.engine_num = 2
- self.config.explorer.rollout_model.tensor_parallel_size = 2
- self.config.explorer.rollout_model.use_v1 = False
- self.config.explorer.rollout_model.chat_template = CHAT_TEMPLATE
- self.config.check_and_update()
- self.engines, self.auxiliary_engines = create_inference_models(self.config)
- self.model_wrapper = ModelWrapper(self.engines[0], model_type="vllm_async")
+# class TestModelWrapperAsyncTPV0(BaseTestModelWrapper, RayUnittestBase):
+# def setUp(self):
+# self.config = get_template_config()
+# self.config.mode = "explore"
+# self.config.model.model_path = get_model_path()
+# self.config.explorer.rollout_model.engine_type = "vllm_async"
+# self.config.explorer.rollout_model.engine_num = 2
+# self.config.explorer.rollout_model.tensor_parallel_size = 2
+# self.config.explorer.rollout_model.use_v1 = False
+# self.config.explorer.rollout_model.chat_template = CHAT_TEMPLATE
+# self.config.check_and_update()
+# self.engines, self.auxiliary_engines = create_inference_models(self.config)
+# self.model_wrapper = ModelWrapper(self.engines[0], model_type="vllm_async")
-class TestModelWrapperAsyncTPV1(BaseTestModelWrapper, RayUnittestBase):
- def setUp(self):
- self.config = get_template_config()
- self.config.mode = "explore"
- self.config.model.model_path = get_model_path()
- self.config.explorer.rollout_model.engine_type = "vllm_async"
- self.config.explorer.rollout_model.engine_num = 2
- self.config.explorer.rollout_model.tensor_parallel_size = 2
- self.config.explorer.rollout_model.use_v1 = True
- self.config.explorer.rollout_model.chat_template = CHAT_TEMPLATE
- self.config.algorithm.repeat_times = 2
- self.config.check_and_update()
- self.engines, self.auxiliary_engines = create_inference_models(self.config)
- self.model_wrapper = ModelWrapper(self.engines[0], model_type="vllm_async")
+# class TestModelWrapperAsyncTPV1(BaseTestModelWrapper, RayUnittestBase):
+# def setUp(self):
+# self.config = get_template_config()
+# self.config.mode = "explore"
+# self.config.model.model_path = get_model_path()
+# self.config.explorer.rollout_model.engine_type = "vllm_async"
+# self.config.explorer.rollout_model.engine_num = 2
+# self.config.explorer.rollout_model.tensor_parallel_size = 2
+# self.config.explorer.rollout_model.use_v1 = True
+# self.config.explorer.rollout_model.chat_template = CHAT_TEMPLATE
+# self.config.algorithm.repeat_times = 2
+# self.config.check_and_update()
+# self.engines, self.auxiliary_engines = create_inference_models(self.config)
+# self.model_wrapper = ModelWrapper(self.engines[0], model_type="vllm_async")
-class TestModelWrapperAsyncV1(BaseTestModelWrapper, RayUnittestBase):
- def setUp(self):
- self.config = get_template_config()
- self.config.mode = "explore"
- self.config.model.model_path = get_model_path()
- self.config.explorer.rollout_model.engine_type = "vllm_async"
- self.config.explorer.rollout_model.engine_num = 2
- self.config.explorer.rollout_model.tensor_parallel_size = 1
- self.config.explorer.rollout_model.use_v1 = True
- self.config.explorer.rollout_model.chat_template = CHAT_TEMPLATE
- self.config.check_and_update()
- self.engines, self.auxiliary_engines = create_inference_models(self.config)
- self.model_wrapper = ModelWrapper(self.engines[0], model_type="vllm_async")
+# class TestModelWrapperAsyncV1(BaseTestModelWrapper, RayUnittestBase):
+# def setUp(self):
+# self.config = get_template_config()
+# self.config.mode = "explore"
+# self.config.model.model_path = get_model_path()
+# self.config.explorer.rollout_model.engine_type = "vllm_async"
+# self.config.explorer.rollout_model.engine_num = 2
+# self.config.explorer.rollout_model.tensor_parallel_size = 1
+# self.config.explorer.rollout_model.use_v1 = True
+# self.config.explorer.rollout_model.chat_template = CHAT_TEMPLATE
+# self.config.check_and_update()
+# self.engines, self.auxiliary_engines = create_inference_models(self.config)
+# self.model_wrapper = ModelWrapper(self.engines[0], model_type="vllm_async")
class TestAPIServer(RayUnittestBase):
diff --git a/tests/template/verl_config.yaml b/tests/template/verl_config.yaml
index d1e84cb455..0b6330c17e 100644
--- a/tests/template/verl_config.yaml
+++ b/tests/template/verl_config.yaml
@@ -8,15 +8,9 @@ actor_rollout_ref:
actor:
strategy: fsdp # This is for backward-compatibility
ppo_mini_batch_size: 4
- # ppo_micro_batch_size: 8 # will be deprecated, use ppo_micro_batch_size_per_gpu
ppo_micro_batch_size_per_gpu: 1
use_dynamic_bsz: True
ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
- grad_clip: 1.0
- clip_ratio: 0.2
- entropy_coeff: 0.001
- use_kl_loss: False # True for GRPO
- kl_loss_coef: 0.001 # for grpo
kl_loss_type: low_var_kl # for grpo
ppo_epochs: 1
shuffle: False
@@ -46,37 +40,10 @@ actor_rollout_ref:
wrap_policy:
# transformer_layer_cls_to_wrap: None
min_num_params: 0
- # log_prob_micro_batch_size: 4 # will be deprecated, use log_prob_micro_batch_size_per_gpu
log_prob_micro_batch_size_per_gpu: 1
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
- rollout:
- name: vllm
- use_fire_sampling: False # https://arxiv.org/abs/2410.21236
- prompt_length: ${data.max_prompt_length} # not use for opensource
- response_length: ${data.max_response_length}
- # for vllm rollout
- dtype: bfloat16 # should align with FSDP
- gpu_memory_utilization: 0.4
- ignore_eos: False
- enforce_eager: True
- free_cache_engine: True
- load_format: dummy_dtensor
- tensor_model_parallel_size: 2
- max_num_batched_tokens: 8192
- max_model_len: null
- max_num_seqs: 1024
- # log_prob_micro_batch_size: 8 # will be deprecated, use log_prob_micro_batch_size_per_gpu
- log_prob_micro_batch_size_per_gpu: 1
- log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
- log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
- disable_log_stats: True
- enable_chunked_prefill: True # could get higher throughput
- # for hf rollout
- do_sample: True
- # number of responses (i.e. num sample times)
- n: 1 # > 1 for grpo
critic:
strategy: fsdp
@@ -87,7 +54,6 @@ critic:
warmup_style: constant # select from constant/cosine
total_training_steps: -1 # must be override by program
model:
- tokenizer_path: ${actor_rollout_ref.model.path}
override_config: { }
external_lib: ${actor_rollout_ref.model.external_lib}
enable_gradient_checkpointing: True
@@ -100,7 +66,6 @@ critic:
min_num_params: 0
fsdp_size: -1
ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
- # ppo_micro_batch_size: 8 # will be deprecated, use ppo_micro_batch_size_per_gpu
ppo_micro_batch_size_per_gpu: 1
forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
@@ -113,9 +78,6 @@ critic:
cliprange_value: 0.5
algorithm:
- gamma: 1.0
- lam: 1.0
- adv_estimator: gae
kl_penalty: kl # how to estimate kl divergence
kl_ctrl:
type: fixed
@@ -123,22 +85,14 @@ algorithm:
trainer:
balance_batch: True
- total_epochs: 10
# total_training_steps: null
- project_name: TinyZero
- experiment_name: trinity-qwen2.5-1.5b
logger: [ 'wandb' ]
- val_generations_to_log_to_wandb: 0
- nnodes: 1
- n_gpus_per_node: 2
- save_freq: 20
# auto: find the last ckpt to resume. If can't find, start from scratch
resume_mode: auto # or auto or resume_path if
critic_warmup: 0
default_hdfs_dir: null
remove_previous_ckpt_in_save: False
del_local_ckpt_after_load: False
- default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
val_before_train: False
max_actor_ckpt_to_keep: 1
max_critic_ckpt_to_keep: 1
diff --git a/trinity/common/config.py b/trinity/common/config.py
index 075d6e951f..db1e6b8712 100644
--- a/trinity/common/config.py
+++ b/trinity/common/config.py
@@ -293,7 +293,7 @@ class SynchronizerConfig:
sync_interval: int = 1
# waiting for `sync_timeout` seconds before timeout in `nccl` method
sync_timeout: int = 1200
- # wait for the lastest checkpoint to be ready
+    # wait for the latest checkpoint to be ready # TODO: to be used
wait_for_checkpoint: bool = False
# ! DO NOT SET, automatically calculated
@@ -339,7 +339,7 @@ def _check_interval(self) -> None:
and self.algorithm.algorithm_type != AlgorithmType.DPO
and self.explorer.eval_interval % self.synchronizer.sync_interval != 0
):
- self.buffer.eval_interval = (
+ self.explorer.eval_interval = (
max(self.explorer.eval_interval // self.synchronizer.sync_interval, 1)
) * self.synchronizer.sync_interval
logger.warning(
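
For reference, the fix in _check_interval above writes the adjusted value back to explorer.eval_interval (previously it was written to buffer.eval_interval by mistake), snapping it down to the nearest positive multiple of synchronizer.sync_interval. A minimal standalone sketch of that arithmetic:

    # Mirrors the expression in _check_interval: round eval_interval down to a
    # positive multiple of sync_interval.
    def adjust_eval_interval(eval_interval: int, sync_interval: int) -> int:
        return max(eval_interval // sync_interval, 1) * sync_interval

    assert adjust_eval_interval(1000, 300) == 900  # rounded down to a multiple of 300
    assert adjust_eval_interval(100, 300) == 300   # never below one sync interval
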
diff --git a/trinity/manager/config_manager.py b/trinity/manager/config_manager.py
index 2b7f5f5b8d..9ac2d36f16 100644
--- a/trinity/manager/config_manager.py
+++ b/trinity/manager/config_manager.py
@@ -50,29 +50,41 @@ def _init_default_config(self):
"mode": "both",
"project": "Trinity-RFT",
"exp_name": "qwen2.5-1.5B",
+ "checkpoint_root_dir": "",
"monitor_type": MonitorType.TENSORBOARD.value,
+ # Algorithm Configs
+ "algorithm_type": AlgorithmType.PPO.value,
+ "_grouped_adv_repeat_times": 2,
+ "_not_grouped_adv_repeat_times": 1,
+ "repeat_times": 1,
+ "gamma": 1.0,
+ "lam": 1.0,
# Model Configs
"model_path": "",
"critic_model_path": "",
- "checkpoint_root_dir": "",
+ "max_prompt_tokens": 1024,
+ "max_response_tokens": 1024,
+ # Cluster Config
"node_num": 1,
"gpu_per_node": 8,
"total_gpu_num": 8,
"trainer_gpu_num": 6,
- "max_prompt_tokens": 1024,
- "max_response_tokens": 1024,
- # Global Configs
+ # Buffer Configs
"total_epochs": 20,
"_train_batch_size_per_gpu": 16,
"train_batch_size": 96,
- "eval_interval": 1000,
- "algorithm_type": AlgorithmType.PPO.value,
+ "buffer_max_retry_times": 3,
+ "max_retry_interval": 1,
# Taskset Configs
"taskset_path": "",
"taskset_subset_name": None,
"taskset_split": "train",
"taskset_prompt_key": "question",
"taskset_response_key": "answer",
+ "temperature": 1.0,
+ "top_p": 1.0, # TODO: to be used
+ "top_k": -1, # TODO: to be used
+ "logprobs": 0,
# Eval Taskset Configs
"_eval_tasksets_num": 0,
# Explorer Input Configs
@@ -80,15 +92,13 @@ def _init_default_config(self):
"default_reward_fn_type": "math_reward",
"system_prompt": None,
"reply_prefix": None,
- # Experience Buffer Configs
+ # Experience Buffer / DPO Dataset Configs
"_dpo_storage_type": StorageType.FILE.value,
"_not_dpo_storage_type": StorageType.QUEUE.value,
"storage_type": StorageType.QUEUE.value,
"_dpo_experience_buffer_path": "",
"_not_dpo_experience_buffer_path": "",
"experience_buffer_path": "",
- "buffer_max_retry_times": 3,
- "max_retry_interval": 1,
"dpo_dataset_train_split": "train",
"dpo_dataset_prompt_type": PromptType.MESSAGES.value,
"dpo_dataset_prompt_key": "prompt",
@@ -101,30 +111,32 @@ def _init_default_config(self):
"sft_warmup_messages_key": "messages",
"sft_warmup_prompt_key": "prompt",
"sft_warmup_response_key": "response",
+ # TrainerInput Configs
+ # TODO: read_experience_strategy
+ "sft_warmup_steps": 0,
# Explorer and Sync Configs
+ "runner_num": 32,
+ "max_timeout": 900,
+ "explorer_max_retry_times": 2,
+ "eval_interval": 1000,
+ "eval_on_latest_checkpoint": True,
+ # Rollout Model Configs
"engine_type": "vllm_async",
"engine_num": 2,
- "runner_num": 32,
- "_grouped_adv_repeat_times": 2,
- "_not_grouped_adv_repeat_times": 1,
- "repeat_times": 1,
"tensor_parallel_size": 1,
"use_v1": True,
- "enable_prefix_caching": False,
"enforce_eager": True,
+ "enable_prefix_caching": False,
+ "enable_chunked_prefill": False,
+ "gpu_memory_utilization": 0.9,
"dtype": "bfloat16",
- "temperature": 1.0,
- "top_p": 1.0,
- "top_k": -1,
"seed": 42,
- "logprobs": 0,
- "gpu_memory_utilization": 0.9,
- "enable_chunked_prefill": False,
+ # TODO: max_prompt_tokens
+ # TODO: max_response_tokens
+ # TODO: chat_template
"enable_thinking": False,
"enable_openai_api": False,
- "max_timeout": 900,
- "explorer_max_retry_times": 2,
- "eval_on_latest_checkpoint": True,
+ # TODO: Auxiliary Models Configs
# Synchronizer Configs
"_not_dpo_sync_method": SyncMethod.NCCL.value,
"sync_method": SyncMethod.NCCL.value,
@@ -132,9 +144,15 @@ def _init_default_config(self):
"sync_timeout": 1200,
# Trainer Configs
"trainer_type": "verl",
- "sft_warmup_steps": 0,
"_nccl_save_interval": 100,
"save_interval": 100,
+ # TODO: enable_preview
+ "_not_dpo_actor_use_kl_loss": True,
+ "actor_use_kl_loss": True,
+ "actor_kl_loss_coef": 0.001,
+ "actor_entropy_coef": 0.001,
+ "actor_grad_clip": 1.0,
+ "actor_clip_ratio": 0.2,
# veRL Trainer Configs
"training_args": [
"balance_batch",
@@ -155,8 +173,6 @@ def _init_default_config(self):
"del_local_ckpt_after_load": False,
"max_actor_ckpt_to_keep": None,
"max_critic_ckpt_to_keep": None,
- "gamma": 1.0,
- "lam": 1.0,
"adv_estimator": "gae",
"norm_adv_by_std_in_grpo": True,
"use_kl_in_reward": False,
@@ -174,12 +190,6 @@ def _init_default_config(self):
"actor_tau": 0.0,
"actor_opmd_baseline": "mean",
"actor_use_uid": False,
- "actor_grad_clip": 1.0,
- "actor_clip_ratio": 0.2,
- "actor_entropy_coef": 0.001,
- "_not_dpo_actor_use_kl_loss": True,
- "actor_use_kl_loss": True,
- "actor_kl_loss_coef": 0.001,
"actor_kl_loss_type": "low_var_kl",
"actor_checkpoint": ["model", "hf_model", "optimizer", "extra"],
"critic_lr": 1e-6,
@@ -580,7 +590,7 @@ def _str_for_engine_num_and_tp_size(self):
```"""
def _set_engine_num(self):
- total_gpu_num = st.session_state["gpu_per_node"] * st.session_state["node_num"]
+ total_gpu_num = st.session_state["total_gpu_num"]
max_engine_num = (total_gpu_num - 1) // st.session_state["tensor_parallel_size"]
if st.session_state["engine_num"] > max_engine_num:
st.session_state["engine_num"] = max_engine_num
@@ -596,7 +606,7 @@ def _set_engine_num(self):
)
def _set_tensor_parallel_size(self):
- total_gpu_num = st.session_state["gpu_per_node"] * st.session_state["node_num"]
+ total_gpu_num = st.session_state["total_gpu_num"]
max_tensor_parallel_size = (total_gpu_num - 1) // st.session_state["engine_num"]
if st.session_state["tensor_parallel_size"] > max_tensor_parallel_size:
st.session_state["tensor_parallel_size"] = max_tensor_parallel_size
@@ -1434,13 +1444,6 @@ def _generate_verl_config(self):
"actor_ulysses_sequence_parallel_size"
],
},
- "rollout": {
- "temperature": st.session_state["temperature"],
- "n": st.session_state["repeat_times"],
- },
- },
- "reward_model": {
- "enable": False,
},
"custom_reward_function": {"path": None, "name": "compute_score"},
"algorithm": {
@@ -1455,8 +1458,6 @@ def _generate_verl_config(self):
"logger": ["tensorboard"],
"resume_mode": st.session_state["resume_mode"],
"resume_from_path": st.session_state["resume_from_path"],
- "test_freq": 100,
- "critic_warmup": st.session_state["critic_warmup"],
"default_hdfs_dir": st.session_state["default_hdfs_dir"],
"remove_previous_ckpt_in_save": st.session_state["remove_previous_ckpt_in_save"],
"del_local_ckpt_after_load": st.session_state["del_local_ckpt_after_load"],
@@ -1467,6 +1468,7 @@ def _generate_verl_config(self):
}
if st.session_state["adv_estimator"] == AdvantageEstimator.GAE.value:
+ trainer_config["trainer"]["critic_warmup"] = st.session_state["critic_warmup"]
trainer_config["critic"] = {
"strategy": st.session_state["training_strategy"],
"optim": {
diff --git a/trinity/trainer/verl_trainer.py b/trinity/trainer/verl_trainer.py
index 090a5ff881..7590d6075b 100644
--- a/trinity/trainer/verl_trainer.py
+++ b/trinity/trainer/verl_trainer.py
@@ -416,17 +416,6 @@ def train_rft_step(self, experiences: Experiences) -> Tuple[bool, int]:
actor_output_metrics = reduce_metrics(actor_output.meta_info["metrics"])
metrics.update(actor_output_metrics)
- # validate
- if (
- self.val_reward_fn is not None
- and self.config.trainer.test_freq > 0
- and self.global_steps % self.config.trainer.test_freq == 0
- ):
- pass # TODO: may add validation
- # with _timer("testing", timing_raw):
- # val_metrics: dict = self._validate()
- # metrics.update(val_metrics)
-
if (
self.config.trainer.save_freq > 0
and self.global_steps % self.config.trainer.save_freq == 0
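
Note: the `_set_engine_num` / `_set_tensor_parallel_size` hunks above now read the shared `total_gpu_num` session value instead of recomputing `gpu_per_node * node_num`. A minimal sketch of the budget rule they enforce, which appears to keep at least one GPU free for the trainer; the helper name and the numbers are illustrative, not code from this patch:

```python
# Illustrative only: the clamp rule applied by the updated widgets, assuming
# total_gpu_num == node_num * gpu_per_node and one GPU reserved for training.
def clamp_rollout_settings(total_gpu_num: int, engine_num: int, tensor_parallel_size: int):
    max_engine_num = (total_gpu_num - 1) // tensor_parallel_size
    engine_num = min(engine_num, max_engine_num)
    max_tp_size = (total_gpu_num - 1) // engine_num
    tensor_parallel_size = min(tensor_parallel_size, max_tp_size)
    return engine_num, tensor_parallel_size

# 8 GPUs with tp=2 leaves at most 3 rollout engines (>=1 GPU stays with the trainer).
assert clamp_rollout_settings(8, 4, 2) == (3, 2)
```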
From c3488d59622c3c8bf8e30a96d2b6d18f61c48563 Mon Sep 17 00:00:00 2001
From: chenyushuo <297086016@qq.com>
Date: Fri, 23 May 2025 14:50:28 +0800
Subject: [PATCH 4/6] Re-enable vLLM async tensor-parallel tests
---
tests/common/vllm_test.py | 80 +++++++++++++++++++--------------------
1 file changed, 40 insertions(+), 40 deletions(-)
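
The three previously commented-out test classes below differ only in their tensor-parallel and vLLM engine settings (all use `engine_type="vllm_async"` with `engine_num=2`). For reference, a summary of the restored configurations; the dict is illustrative and not part of the patch:

```python
# Illustrative summary of the re-enabled configurations;
# TestModelWrapperAsyncTPV1 additionally sets config.algorithm.repeat_times = 2.
RESTORED_VLLM_TESTS = {
    "TestModelWrapperAsyncTPV0": {"tensor_parallel_size": 2, "use_v1": False},
    "TestModelWrapperAsyncTPV1": {"tensor_parallel_size": 2, "use_v1": True},
    "TestModelWrapperAsyncV1": {"tensor_parallel_size": 1, "use_v1": True},
}
```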
diff --git a/tests/common/vllm_test.py b/tests/common/vllm_test.py
index e3dfaaa1df..0146eb075c 100644
--- a/tests/common/vllm_test.py
+++ b/tests/common/vllm_test.py
@@ -162,50 +162,50 @@ def setUp(self):
self.model_wrapper = ModelWrapper(self.engines[0], model_type="vllm_async")
-# class TestModelWrapperAsyncTPV0(BaseTestModelWrapper, RayUnittestBase):
-# def setUp(self):
-# self.config = get_template_config()
-# self.config.mode = "explore"
-# self.config.model.model_path = get_model_path()
-# self.config.explorer.rollout_model.engine_type = "vllm_async"
-# self.config.explorer.rollout_model.engine_num = 2
-# self.config.explorer.rollout_model.tensor_parallel_size = 2
-# self.config.explorer.rollout_model.use_v1 = False
-# self.config.explorer.rollout_model.chat_template = CHAT_TEMPLATE
-# self.config.check_and_update()
-# self.engines, self.auxiliary_engines = create_inference_models(self.config)
-# self.model_wrapper = ModelWrapper(self.engines[0], model_type="vllm_async")
+class TestModelWrapperAsyncTPV0(BaseTestModelWrapper, RayUnittestBase):
+ def setUp(self):
+ self.config = get_template_config()
+ self.config.mode = "explore"
+ self.config.model.model_path = get_model_path()
+ self.config.explorer.rollout_model.engine_type = "vllm_async"
+ self.config.explorer.rollout_model.engine_num = 2
+ self.config.explorer.rollout_model.tensor_parallel_size = 2
+ self.config.explorer.rollout_model.use_v1 = False
+ self.config.explorer.rollout_model.chat_template = CHAT_TEMPLATE
+ self.config.check_and_update()
+ self.engines, self.auxiliary_engines = create_inference_models(self.config)
+ self.model_wrapper = ModelWrapper(self.engines[0], model_type="vllm_async")
-# class TestModelWrapperAsyncTPV1(BaseTestModelWrapper, RayUnittestBase):
-# def setUp(self):
-# self.config = get_template_config()
-# self.config.mode = "explore"
-# self.config.model.model_path = get_model_path()
-# self.config.explorer.rollout_model.engine_type = "vllm_async"
-# self.config.explorer.rollout_model.engine_num = 2
-# self.config.explorer.rollout_model.tensor_parallel_size = 2
-# self.config.explorer.rollout_model.use_v1 = True
-# self.config.explorer.rollout_model.chat_template = CHAT_TEMPLATE
-# self.config.algorithm.repeat_times = 2
-# self.config.check_and_update()
-# self.engines, self.auxiliary_engines = create_inference_models(self.config)
-# self.model_wrapper = ModelWrapper(self.engines[0], model_type="vllm_async")
+class TestModelWrapperAsyncTPV1(BaseTestModelWrapper, RayUnittestBase):
+ def setUp(self):
+ self.config = get_template_config()
+ self.config.mode = "explore"
+ self.config.model.model_path = get_model_path()
+ self.config.explorer.rollout_model.engine_type = "vllm_async"
+ self.config.explorer.rollout_model.engine_num = 2
+ self.config.explorer.rollout_model.tensor_parallel_size = 2
+ self.config.explorer.rollout_model.use_v1 = True
+ self.config.explorer.rollout_model.chat_template = CHAT_TEMPLATE
+ self.config.algorithm.repeat_times = 2
+ self.config.check_and_update()
+ self.engines, self.auxiliary_engines = create_inference_models(self.config)
+ self.model_wrapper = ModelWrapper(self.engines[0], model_type="vllm_async")
-# class TestModelWrapperAsyncV1(BaseTestModelWrapper, RayUnittestBase):
-# def setUp(self):
-# self.config = get_template_config()
-# self.config.mode = "explore"
-# self.config.model.model_path = get_model_path()
-# self.config.explorer.rollout_model.engine_type = "vllm_async"
-# self.config.explorer.rollout_model.engine_num = 2
-# self.config.explorer.rollout_model.tensor_parallel_size = 1
-# self.config.explorer.rollout_model.use_v1 = True
-# self.config.explorer.rollout_model.chat_template = CHAT_TEMPLATE
-# self.config.check_and_update()
-# self.engines, self.auxiliary_engines = create_inference_models(self.config)
-# self.model_wrapper = ModelWrapper(self.engines[0], model_type="vllm_async")
+class TestModelWrapperAsyncV1(BaseTestModelWrapper, RayUnittestBase):
+ def setUp(self):
+ self.config = get_template_config()
+ self.config.mode = "explore"
+ self.config.model.model_path = get_model_path()
+ self.config.explorer.rollout_model.engine_type = "vllm_async"
+ self.config.explorer.rollout_model.engine_num = 2
+ self.config.explorer.rollout_model.tensor_parallel_size = 1
+ self.config.explorer.rollout_model.use_v1 = True
+ self.config.explorer.rollout_model.chat_template = CHAT_TEMPLATE
+ self.config.check_and_update()
+ self.engines, self.auxiliary_engines = create_inference_models(self.config)
+ self.model_wrapper = ModelWrapper(self.engines[0], model_type="vllm_async")
class TestAPIServer(RayUnittestBase):
From 25ee0e17792d4e6dfa089cda5ced111fe8eed14b Mon Sep 17 00:00:00 2001
From: chenyushuo <297086016@qq.com>
Date: Fri, 23 May 2025 15:17:56 +0800
Subject: [PATCH 5/6] Update config_test to validate example configs via check_and_update
---
tests/common/config_test.py | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
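
The change below makes the test validate each example config rather than only parse it. A minimal usage sketch of the same two-step pattern; the import path and file path are assumptions, not taken from this patch:

```python
# Sketch only: load an example config, then let check_and_update() normalize and
# validate it, which is what the updated test now exercises for every example.
from trinity.common.config import load_config  # import path assumed

config = load_config("examples/grpo_gsm8k/gsm8k.yaml")  # placeholder path
config.check_and_update()  # raises if the example config is inconsistent
```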
diff --git a/tests/common/config_test.py b/tests/common/config_test.py
index 35b9a4f9c7..e1ac0aa7d4 100644
--- a/tests/common/config_test.py
+++ b/tests/common/config_test.py
@@ -46,7 +46,8 @@ def test_all_examples_are_valid(self):
print(f"Checking config: {filename}")
config_path = os.path.join(example_dir, example_name, filename)
try:
- load_config(config_path)
+ config = load_config(config_path)
+ config.check_and_update()
except Exception as e:
print(f"Error loading config {config_path}: {e}")
raise e
From 5050bff1908644d4cc734e525cdd530e3c057527 Mon Sep 17 00:00:00 2001
From: chenyushuo <297086016@qq.com>
Date: Fri, 23 May 2025 16:09:09 +0800
Subject: [PATCH 6/6] Use optional overrides to sync trainer/algorithm config
---
.../source/tutorial/trinity_configs.md | 8 ++++++-
examples/async_gsm8k/verl_config.yaml | 8 ++++++-
examples/dpo_humanlike/train_dpo.yaml | 8 ++++++-
examples/grpo_alfworld/train_alfworld.yaml | 8 ++++++-
examples/grpo_gsm8k/train_gsm8k.yaml | 8 ++++++-
examples/grpo_math/train_math.yaml | 8 ++++++-
examples/grpo_sciworld/train_sciworld.yaml | 8 ++++++-
examples/grpo_webshop/train_webshop.yaml | 8 ++++++-
examples/opmd_gsm8k/train_opmd_gsm8k.yaml | 8 ++++++-
examples/ppo_countdown/train_countdown.yaml | 8 ++++++-
tests/template/verl_config.yaml | 8 ++++++-
trinity/common/config.py | 14 +++++------
trinity/common/verl_config.py | 23 ++++++++++++-------
13 files changed, 99 insertions(+), 26 deletions(-)
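
The "better way" below treats the Trinity-side trainer and algorithm fields as optional overrides: `None` means "keep the value from the veRL YAML", while an explicitly set value replaces it in `synchronize_config`. A self-contained sketch of that rule with simplified stand-in dataclasses (names are illustrative, not the real classes):

```python
from dataclasses import dataclass
from typing import Optional

@dataclass
class TrinityAlgorithmOverrides:  # stand-in for the Optional fields in AlgorithmConfig
    gamma: Optional[float] = None
    lam: Optional[float] = None

@dataclass
class VerlAlgorithm:  # stand-in for the verl-side algorithm section (YAML defaults)
    gamma: float = 1.0
    lam: float = 1.0

def apply_overrides(verl: VerlAlgorithm, trinity: TrinityAlgorithmOverrides) -> None:
    # Mirrors the new synchronize_config behaviour: only explicitly set values win.
    if trinity.gamma is not None:
        verl.gamma = trinity.gamma
    if trinity.lam is not None:
        verl.lam = trinity.lam

verl = VerlAlgorithm()
apply_overrides(verl, TrinityAlgorithmOverrides(gamma=0.99))
assert (verl.gamma, verl.lam) == (0.99, 1.0)  # lam keeps the YAML default
```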
diff --git a/docs/sphinx_doc/source/tutorial/trinity_configs.md b/docs/sphinx_doc/source/tutorial/trinity_configs.md
index ea31b34351..fc599e206b 100644
--- a/docs/sphinx_doc/source/tutorial/trinity_configs.md
+++ b/docs/sphinx_doc/source/tutorial/trinity_configs.md
@@ -222,6 +222,11 @@ actor_rollout_ref:
ppo_micro_batch_size_per_gpu: 4
use_dynamic_bsz: True
ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
+ grad_clip: 1.0
+ clip_ratio: 0.2
+ entropy_coeff: 0.001
+ use_kl_loss: False # True for GRPO
+ kl_loss_coef: 0.001 # for grpo
kl_loss_type: low_var_kl # for grpo
ppo_epochs: 1
shuffle: False
@@ -294,6 +299,8 @@ custom_reward_function:
name: compute_score
algorithm:
+ gamma: 1.0
+ lam: 1.0
norm_adv_by_std_in_grpo: True
use_kl_in_reward: False
kl_penalty: kl # how to estimate kl divergence
@@ -306,7 +313,6 @@ algorithm:
trainer:
balance_batch: True
# total_training_steps: null
- logger: [ 'wandb' ]
# auto: find the last ckpt to resume. If can't find, start from scratch
resume_mode: auto # or auto or resume_path if
resume_from_path: ""
diff --git a/examples/async_gsm8k/verl_config.yaml b/examples/async_gsm8k/verl_config.yaml
index 18d3060847..de1b08f590 100644
--- a/examples/async_gsm8k/verl_config.yaml
+++ b/examples/async_gsm8k/verl_config.yaml
@@ -11,6 +11,11 @@ actor_rollout_ref:
ppo_micro_batch_size_per_gpu: 4
use_dynamic_bsz: True # False
ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
+ grad_clip: 1.0
+ clip_ratio: 0.2
+ entropy_coeff: 0.001
+ use_kl_loss: True # True for GRPO
+ kl_loss_coef: 0.001 # for grpo
kl_loss_type: low_var_kl # for grpo
ppo_epochs: 1
shuffle: False
@@ -48,6 +53,8 @@ custom_reward_function:
name: compute_score
algorithm:
+ gamma: 1.0
+ lam: 1.0
kl_penalty: kl # how to estimate kl divergence
kl_ctrl:
type: fixed
@@ -56,7 +63,6 @@ algorithm:
trainer:
balance_batch: True
# total_training_steps: null
- logger: [ 'console','wandb' ]
# auto: find the last ckpt to resume. If can't find, start from scratch
resume_mode: auto # or auto or resume_path if
default_hdfs_dir: null
diff --git a/examples/dpo_humanlike/train_dpo.yaml b/examples/dpo_humanlike/train_dpo.yaml
index ccad2baa37..8ffc68b397 100644
--- a/examples/dpo_humanlike/train_dpo.yaml
+++ b/examples/dpo_humanlike/train_dpo.yaml
@@ -11,6 +11,11 @@ actor_rollout_ref:
ppo_micro_batch_size_per_gpu: 2 # NOTE
use_dynamic_bsz: False
ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
+ grad_clip: 1.0
+ clip_ratio: 0.2
+ entropy_coeff: 0.001
+ use_kl_loss: True
+ kl_loss_coef: 0.1 # NOTE: beta for DPO
kl_loss_type: low_var_kl # for grpo
ppo_epochs: 1
shuffle: False
@@ -47,6 +52,8 @@ custom_reward_function:
name: compute_score
algorithm:
+ gamma: 1.0
+ lam: 1.0
kl_penalty: kl
kl_ctrl:
type: fixed
@@ -55,7 +62,6 @@ algorithm:
trainer:
balance_batch: False
total_training_steps: 783 #
- logger: [ 'console','wandb' ]
# auto: find the last ckpt to resume. If can't find, start from scratch
resume_mode: auto # or auto or resume_path if
default_hdfs_dir: null
diff --git a/examples/grpo_alfworld/train_alfworld.yaml b/examples/grpo_alfworld/train_alfworld.yaml
index 169b0fe020..215b1817ab 100644
--- a/examples/grpo_alfworld/train_alfworld.yaml
+++ b/examples/grpo_alfworld/train_alfworld.yaml
@@ -11,6 +11,11 @@ actor_rollout_ref:
ppo_micro_batch_size_per_gpu: 1
use_dynamic_bsz: False
ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
+ grad_clip: 1.0
+ clip_ratio: 0.2
+ entropy_coeff: 0.001
+ use_kl_loss: True # True for GRPO
+ kl_loss_coef: 0.001 # for grpo
kl_loss_type: low_var_kl # for grpo
ppo_epochs: 1
shuffle: False
@@ -44,6 +49,8 @@ custom_reward_function:
name: compute_score
algorithm:
+ gamma: 1.0
+ lam: 1.0
kl_penalty: kl # how to estimate kl divergence
kl_ctrl:
type: fixed
@@ -52,7 +59,6 @@ algorithm:
trainer:
balance_batch: True
# total_training_steps: null
- logger: [ 'wandb' ]
# auto: find the last ckpt to resume. If can't find, start from scratch
resume_mode: auto # or auto or resume_path if
default_hdfs_dir: null
diff --git a/examples/grpo_gsm8k/train_gsm8k.yaml b/examples/grpo_gsm8k/train_gsm8k.yaml
index 18d3060847..de1b08f590 100644
--- a/examples/grpo_gsm8k/train_gsm8k.yaml
+++ b/examples/grpo_gsm8k/train_gsm8k.yaml
@@ -11,6 +11,11 @@ actor_rollout_ref:
ppo_micro_batch_size_per_gpu: 4
use_dynamic_bsz: True # False
ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
+ grad_clip: 1.0
+ clip_ratio: 0.2
+ entropy_coeff: 0.001
+ use_kl_loss: True # True for GRPO
+ kl_loss_coef: 0.001 # for grpo
kl_loss_type: low_var_kl # for grpo
ppo_epochs: 1
shuffle: False
@@ -48,6 +53,8 @@ custom_reward_function:
name: compute_score
algorithm:
+ gamma: 1.0
+ lam: 1.0
kl_penalty: kl # how to estimate kl divergence
kl_ctrl:
type: fixed
@@ -56,7 +63,6 @@ algorithm:
trainer:
balance_batch: True
# total_training_steps: null
- logger: [ 'console','wandb' ]
# auto: find the last ckpt to resume. If can't find, start from scratch
resume_mode: auto # or auto or resume_path if
default_hdfs_dir: null
diff --git a/examples/grpo_math/train_math.yaml b/examples/grpo_math/train_math.yaml
index b783445bf7..78bcb862c6 100644
--- a/examples/grpo_math/train_math.yaml
+++ b/examples/grpo_math/train_math.yaml
@@ -11,6 +11,11 @@ actor_rollout_ref:
ppo_micro_batch_size_per_gpu: 4
use_dynamic_bsz: True # False
ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
+ grad_clip: 1.0
+ clip_ratio: 0.2
+ entropy_coeff: 0.001
+ use_kl_loss: True # True for GRPO
+ kl_loss_coef: 0.0001 # for grpo
kl_loss_type: low_var_kl # for grpo
ppo_epochs: 1
shuffle: False
@@ -48,6 +53,8 @@ custom_reward_function:
name: compute_score
algorithm:
+ gamma: 1.0
+ lam: 1.0
kl_penalty: kl # how to estimate kl divergence
kl_ctrl:
type: fixed
@@ -55,7 +62,6 @@ algorithm:
trainer:
balance_batch: True
- logger: [ 'console','wandb' ]
# auto: find the last ckpt to resume. If can't find, start from scratch
resume_mode: auto # or auto or resume_path if
default_hdfs_dir: null
diff --git a/examples/grpo_sciworld/train_sciworld.yaml b/examples/grpo_sciworld/train_sciworld.yaml
index 169b0fe020..215b1817ab 100644
--- a/examples/grpo_sciworld/train_sciworld.yaml
+++ b/examples/grpo_sciworld/train_sciworld.yaml
@@ -11,6 +11,11 @@ actor_rollout_ref:
ppo_micro_batch_size_per_gpu: 1
use_dynamic_bsz: False
ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
+ grad_clip: 1.0
+ clip_ratio: 0.2
+ entropy_coeff: 0.001
+ use_kl_loss: True # True for GRPO
+ kl_loss_coef: 0.001 # for grpo
kl_loss_type: low_var_kl # for grpo
ppo_epochs: 1
shuffle: False
@@ -44,6 +49,8 @@ custom_reward_function:
name: compute_score
algorithm:
+ gamma: 1.0
+ lam: 1.0
kl_penalty: kl # how to estimate kl divergence
kl_ctrl:
type: fixed
@@ -52,7 +59,6 @@ algorithm:
trainer:
balance_batch: True
# total_training_steps: null
- logger: [ 'wandb' ]
# auto: find the last ckpt to resume. If can't find, start from scratch
resume_mode: auto # or auto or resume_path if
default_hdfs_dir: null
diff --git a/examples/grpo_webshop/train_webshop.yaml b/examples/grpo_webshop/train_webshop.yaml
index 169b0fe020..215b1817ab 100644
--- a/examples/grpo_webshop/train_webshop.yaml
+++ b/examples/grpo_webshop/train_webshop.yaml
@@ -11,6 +11,11 @@ actor_rollout_ref:
ppo_micro_batch_size_per_gpu: 1
use_dynamic_bsz: False
ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
+ grad_clip: 1.0
+ clip_ratio: 0.2
+ entropy_coeff: 0.001
+ use_kl_loss: True # True for GRPO
+ kl_loss_coef: 0.001 # for grpo
kl_loss_type: low_var_kl # for grpo
ppo_epochs: 1
shuffle: False
@@ -44,6 +49,8 @@ custom_reward_function:
name: compute_score
algorithm:
+ gamma: 1.0
+ lam: 1.0
kl_penalty: kl # how to estimate kl divergence
kl_ctrl:
type: fixed
@@ -52,7 +59,6 @@ algorithm:
trainer:
balance_batch: True
# total_training_steps: null
- logger: [ 'wandb' ]
# auto: find the last ckpt to resume. If can't find, start from scratch
resume_mode: auto # or auto or resume_path if
default_hdfs_dir: null
diff --git a/examples/opmd_gsm8k/train_opmd_gsm8k.yaml b/examples/opmd_gsm8k/train_opmd_gsm8k.yaml
index 22c49da486..326904d987 100644
--- a/examples/opmd_gsm8k/train_opmd_gsm8k.yaml
+++ b/examples/opmd_gsm8k/train_opmd_gsm8k.yaml
@@ -35,6 +35,11 @@ actor_rollout_ref:
ppo_micro_batch_size_per_gpu: 4
use_dynamic_bsz: True
ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
+ grad_clip: 1.0
+ clip_ratio: 0.2
+ entropy_coeff: 0.000
+ use_kl_loss: True
+ kl_loss_coef: 0.001
kl_loss_type: mse
ppo_epochs: 1
shuffle: False
@@ -74,6 +79,8 @@ custom_reward_function:
name: compute_score
algorithm:
+ gamma: 1.0
+ lam: 1.0
kl_penalty: kl # how to estimate kl divergence
kl_ctrl:
type: fixed
@@ -82,7 +89,6 @@ algorithm:
trainer:
balance_batch: True
# total_training_steps: null
- logger: [ 'console','wandb' ]
# auto: find the last ckpt to resume. If can't find, start from scratch
resume_mode: auto # or auto or resume_path if
default_hdfs_dir: null
diff --git a/examples/ppo_countdown/train_countdown.yaml b/examples/ppo_countdown/train_countdown.yaml
index fbc0ecc1d0..ae16122ef7 100644
--- a/examples/ppo_countdown/train_countdown.yaml
+++ b/examples/ppo_countdown/train_countdown.yaml
@@ -11,6 +11,11 @@ actor_rollout_ref:
ppo_micro_batch_size_per_gpu: 4
use_dynamic_bsz: True
ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
+ grad_clip: 1.0
+ clip_ratio: 0.2
+ entropy_coeff: 0.001
+ use_kl_loss: False # True for GRPO
+ kl_loss_coef: 0.001 # for grpo
kl_loss_type: low_var_kl # for grpo
ppo_epochs: 1
shuffle: False
@@ -82,6 +87,8 @@ custom_reward_function:
name: compute_score
algorithm:
+ gamma: 1.0
+ lam: 1.0
kl_penalty: kl # how to estimate kl divergence
kl_ctrl:
type: fixed
@@ -90,7 +97,6 @@ algorithm:
trainer:
balance_batch: True
# total_training_steps: null
- logger: [ 'wandb' ]
# auto: find the last ckpt to resume. If can't find, start from scratch
resume_mode: auto # or auto or resume_path if
critic_warmup: 0
diff --git a/tests/template/verl_config.yaml b/tests/template/verl_config.yaml
index 0b6330c17e..b17fc87958 100644
--- a/tests/template/verl_config.yaml
+++ b/tests/template/verl_config.yaml
@@ -11,6 +11,11 @@ actor_rollout_ref:
ppo_micro_batch_size_per_gpu: 1
use_dynamic_bsz: True
ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
+ grad_clip: 1.0
+ clip_ratio: 0.2
+ entropy_coeff: 0.001
+ use_kl_loss: False # True for GRPO
+ kl_loss_coef: 0.001 # for grpo
kl_loss_type: low_var_kl # for grpo
ppo_epochs: 1
shuffle: False
@@ -78,6 +83,8 @@ critic:
cliprange_value: 0.5
algorithm:
+ gamma: 1.0
+ lam: 1.0
kl_penalty: kl # how to estimate kl divergence
kl_ctrl:
type: fixed
@@ -86,7 +93,6 @@ algorithm:
trainer:
balance_batch: True
# total_training_steps: null
- logger: [ 'wandb' ]
# auto: find the last ckpt to resume. If can't find, start from scratch
resume_mode: auto # or auto or resume_path if
critic_warmup: 0
diff --git a/trinity/common/config.py b/trinity/common/config.py
index db1e6b8712..e0660ab03a 100644
--- a/trinity/common/config.py
+++ b/trinity/common/config.py
@@ -173,8 +173,8 @@ class AlgorithmConfig:
algorithm_type: AlgorithmType = AlgorithmType.PPO
# for GRPO-like algorithms, repeat each task for `repeat_times` times
repeat_times: int = 1
- gamma: float = 1.0
- lam: float = 1.0
+ gamma: Optional[float] = None
+ lam: Optional[float] = None
# TODO: add more algorithm params here
@@ -263,11 +263,11 @@ class TrainerConfig:
enable_preview: bool = True # enable rollout preview in wandb
# trainer configs
- actor_use_kl_loss: bool = False
- actor_kl_loss_coef: float = 0.001
- actor_entropy_coef: float = 0.001
- actor_grad_clip: float = 1.0
- actor_clip_ratio: float = 0.2
+ actor_use_kl_loss: Optional[bool] = None
+ actor_kl_loss_coef: Optional[float] = None
+ actor_entropy_coef: Optional[float] = None
+ actor_grad_clip: Optional[float] = None
+ actor_clip_ratio: Optional[float] = None
# TODO: extract more train-related params from underlying trainer engine
# Only one needs to be set for `trainer_config` and `trainer_config_path`
diff --git a/trinity/common/verl_config.py b/trinity/common/verl_config.py
index 0b04ef4a6a..e5d0d9d55f 100644
--- a/trinity/common/verl_config.py
+++ b/trinity/common/verl_config.py
@@ -233,7 +233,7 @@ class veRLConfig:
synchronizer: Optional[SynchronizerConfig] = None
enable_preview: bool = True
- def synchronize_config(self, config: Config) -> None:
+ def synchronize_config(self, config: Config) -> None: # noqa: C901
"""Synchronize config."""
if config.mode != "train":
rollout_gpu_num = (
@@ -299,15 +299,22 @@ def synchronize_config(self, config: Config) -> None:
self.critic.ppo_mini_batch_size = config.buffer.batch_size
self.critic.rollout_n = self.actor_rollout_ref.rollout.n
- self.actor_rollout_ref.actor.use_kl_loss = config.trainer.actor_use_kl_loss
- self.actor_rollout_ref.actor.kl_loss_coef = config.trainer.actor_kl_loss_coef
- self.actor_rollout_ref.actor.entropy_coeff = config.trainer.actor_entropy_coef
- self.actor_rollout_ref.actor.grad_clip = config.trainer.actor_grad_clip
- self.actor_rollout_ref.actor.clip_ratio = config.trainer.actor_clip_ratio
+ if config.trainer.actor_use_kl_loss is not None:
+ self.actor_rollout_ref.actor.use_kl_loss = config.trainer.actor_use_kl_loss
+ if config.trainer.actor_kl_loss_coef is not None:
+ self.actor_rollout_ref.actor.kl_loss_coef = config.trainer.actor_kl_loss_coef
+ if config.trainer.actor_entropy_coef is not None:
+ self.actor_rollout_ref.actor.entropy_coeff = config.trainer.actor_entropy_coef
+ if config.trainer.actor_grad_clip is not None:
+ self.actor_rollout_ref.actor.grad_clip = config.trainer.actor_grad_clip
+ if config.trainer.actor_clip_ratio is not None:
+ self.actor_rollout_ref.actor.clip_ratio = config.trainer.actor_clip_ratio
# Algorithm related config
- self.algorithm.gamma = config.algorithm.gamma
- self.algorithm.lam = config.algorithm.lam
+ if config.algorithm.gamma is not None:
+ self.algorithm.gamma = config.algorithm.gamma
+ if config.algorithm.lam is not None:
+ self.algorithm.lam = config.algorithm.lam
self.actor_rollout_ref.actor.algorithm_type = config.algorithm.algorithm_type
if config.algorithm.algorithm_type == AlgorithmType.PPO:
logger.info("Using GAE `adv_estimator` for PPO")