diff --git a/docs/sphinx_doc/source/tutorial/trinity_configs.md b/docs/sphinx_doc/source/tutorial/trinity_configs.md index 255bb58018..fc599e206b 100644 --- a/docs/sphinx_doc/source/tutorial/trinity_configs.md +++ b/docs/sphinx_doc/source/tutorial/trinity_configs.md @@ -187,39 +187,30 @@ Support `nccl` and `checkpoint`, `nccl` represents that model weights in `explor ```yaml trainer: trainer_type: 'verl' - trainer_config_path: 'examples/ppo_countdown/train_countdown.yaml' save_interval: 100 + trainer_config_path: 'examples/ppo_countdown/train_countdown.yaml' ``` - `trainer.trainer_type`: The backend of the trainer, Only `verl` is supported. -- `trainer.trainer_config_path`: The path to the trainer configuration file. It must be set manually. - `trainer.save_interval`: The interval steps between two checkpoints. Default is `100`. +- `trainer.actor_grad_clip`: Gradient clipping value for actor model training. +- `trainer.actor_clip_ratio`: The clipping ratio used to compute the PPO policy loss. +- `trainer.actor_entropy_coef`: The entropy coefficient used to compute the policy loss. +- `trainer.actor_use_kl_loss`: Whether to enable the KL loss. +- `trainer.actor_kl_loss_coef`: The coefficient of the KL loss. + +- `trainer.trainer_config`: The configuration of the trainer. Only one of `trainer.trainer_config` and `trainer.trainer_config_path` needs to be set. +- `trainer.trainer_config_path`: The path to the trainer configuration file. + ### veRL Trainer Configuration Here we mainly introduce the parameters that can be set in veRL. For the specific meaning of the parameters, please refer to the official document of [veRL](https://github.com/volcengine/verl/blob/0bdf7f469854815177e73dcfe9e420836c952e6e/docs/examples/config.rst). ```yaml -data: - tokenizer: null - train_files: train_example.parquet - val_files: test_example.parquet - prompt_key: prompt - max_prompt_length: 256 - max_response_length: 1024 - train_batch_size: 256 - val_batch_size: null - return_raw_input_ids: False # This should be set to true when the tokenizer between policy and rm differs - return_raw_chat: False - shuffle: True - filter_overlong_prompts: False # for large-scale dataset, filtering overlong prompts could be timeconsuming.
You should disable this and set `truncation='left' - truncation: error - image_key: images - actor_rollout_ref: hybrid_engine: True model: - path: /PATH/TO/MODEL/CHECKPOINT/ external_lib: null override_config: { } enable_gradient_checkpointing: True @@ -270,35 +261,6 @@ actor_rollout_ref: log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size - rollout: - name: vllm - temperature: 1.0 - top_k: -1 # 0 for hf rollout, -1 for vllm rollout - top_p: 1 - use_fire_sampling: False # https://arxiv.org/abs/2410.21236 - prompt_length: ${data.max_prompt_length} # not use for opensource - response_length: ${data.max_response_length} - # for vllm rollout - dtype: bfloat16 # should align with FSDP - gpu_memory_utilization: 0.4 - ignore_eos: False - enforce_eager: True - free_cache_engine: True - load_format: dummy_dtensor - tensor_model_parallel_size: 2 - max_num_batched_tokens: 8192 - max_model_len: null - max_num_seqs: 1024 - # log_prob_micro_batch_size: 8 # will be deprecated, use log_prob_micro_batch_size_per_gpu - log_prob_micro_batch_size_per_gpu: 4 - log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} - log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} - disable_log_stats: True - enable_chunked_prefill: True # could get higher throughput - # for hf rollout - do_sample: True - # number of responses (i.e. num sample times) - n: 1 # > 1 for grpo critic: strategy: fsdp @@ -309,8 +271,6 @@ critic: warmup_style: constant # select from constant/cosine total_training_steps: -1 # must be override by program model: - path: /PATH/TO/MODEL/CHECKPOINT/ - tokenizer_path: ${actor_rollout_ref.model.path} override_config: { } external_lib: ${actor_rollout_ref.model.external_lib} enable_gradient_checkpointing: True @@ -323,7 +283,6 @@ critic: min_num_params: 0 fsdp_size: -1 ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size} - # ppo_micro_batch_size: 8 # will be deprecated, use ppo_micro_batch_size_per_gpu ppo_micro_batch_size_per_gpu: 8 forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu} use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} @@ -335,26 +294,6 @@ critic: grad_clip: 1.0 cliprange_value: 0.5 -reward_model: - enable: False - strategy: fsdp - model: - input_tokenizer: ${actor_rollout_ref.model.path} # set this to null if the chat template is identical - path: ~/models/FsfairX-LLaMA3-RM-v0.1 - external_lib: ${actor_rollout_ref.model.external_lib} - use_remove_padding: False - fsdp_config: - min_num_params: 0 - param_offload: False - fsdp_size: -1 - # micro_batch_size: null # will be deprecated, use micro_batch_size_per_gpu - # micro_batch_size_per_gpu: 2 # set a number - # max_length: null - ulysses_sequence_parallel_size: 1 # sp size - use_dynamic_bsz: ${critic.use_dynamic_bsz} - forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} - reward_manager: tinyzero - custom_reward_function: path: null name: compute_score @@ -362,7 +301,6 @@ custom_reward_function: algorithm: gamma: 1.0 lam: 1.0 - adv_estimator: gae norm_adv_by_std_in_grpo: True use_kl_in_reward: False kl_penalty: kl # how to estimate kl divergence @@ -374,24 +312,14 @@ algorithm: trainer: balance_batch: True - total_epochs: 15 # total_training_steps: null - project_name: TinyZero - experiment_name: trinity-qwen2.5-1.5b - logger: [ 
'wandb' ] - val_generations_to_log_to_wandb: 0 - nnodes: 1 - n_gpus_per_node: 2 - save_freq: 100 # auto: find the last ckpt to resume. If can't find, start from scratch resume_mode: auto # or auto or resume_path if resume_from_path: "" - test_freq: 100 critic_warmup: 0 default_hdfs_dir: null remove_previous_ckpt_in_save: False del_local_ckpt_after_load: False - default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name} val_before_train: False max_actor_ckpt_to_keep: 5 max_critic_ckpt_to_keep: 5 @@ -402,11 +330,6 @@ trainer: - `actor_rollout_ref.model.use_remove_padding`: Whether to remove pad tokens, which will reduce training time. - `actor_rollout_ref.actor.use_dynamic_bsz`: Whether to reorganize the batch data, specifically to splice the shorter data to reduce the batch size in the actual training process. - `actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu`: Batch size for one GPU in one forward pass. -- `actor_rollout_ref.actor.grad_clip`: Gradient clip for actor model training. -- `actor_rollout_ref.actor.clip_ratio`: Used for compute policy loss. -- `actor_rollout_ref.actor.entropy_coeff`: Used for compute policy loss. -- `actor_rollout_ref.actor.use_kl_loss`: Whether to enable kl loss. -- `actor_rollout_ref.actor.kl_loss_coef`: The coefficient of kl loss. - `actor_rollout_ref.actor.kl_loss_type`: How to compute kl loss, optional value is `kl`, `abs`, `mse` or `low_var_kl`. - `actor_rollout_ref.actor.ulysses_sequence_parallel_size`: Ulysses sequence parallel size. - `actor_rollout_ref.actor.tau`: strength of regularization w.r.t. old / ref policy. diff --git a/examples/async_gsm8k/verl_config.yaml b/examples/async_gsm8k/verl_config.yaml index 268d61e0e5..de1b08f590 100644 --- a/examples/async_gsm8k/verl_config.yaml +++ b/examples/async_gsm8k/verl_config.yaml @@ -1,23 +1,6 @@ -data: - tokenizer: null - train_files: placeholder - val_files: placeholder - prompt_key: prompt - max_prompt_length: 256 - max_response_length: 1024 - train_batch_size: 256 - val_batch_size: null - return_raw_input_ids: False # This should be set to true when the tokenizer between policy and rm differs - return_raw_chat: False - shuffle: True - filter_overlong_prompts: False # for large-scale dataset, filtering overlong prompts could be timeconsuming. 
You should disable this and set `truncation='left' - truncation: error - image_key: images - actor_rollout_ref: hybrid_engine: True model: - path: /PATH/TO/MODEL/ external_lib: null override_config: { } enable_gradient_checkpointing: True @@ -25,7 +8,6 @@ actor_rollout_ref: actor: strategy: fsdp # This is for backward-compatibility ppo_mini_batch_size: 128 - # ppo_micro_batch_size: 8 # will be deprecated, use ppo_micro_batch_size_per_gpu ppo_micro_batch_size_per_gpu: 4 use_dynamic_bsz: True # False ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length} @@ -61,92 +43,10 @@ actor_rollout_ref: wrap_policy: # transformer_layer_cls_to_wrap: None min_num_params: 0 - # log_prob_micro_batch_size: 4 # will be deprecated, use log_prob_micro_batch_size_per_gpu log_prob_micro_batch_size_per_gpu: 16 log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size - rollout: - name: vllm - temperature: 1.0 - use_fire_sampling: False # https://arxiv.org/abs/2410.21236 - prompt_length: ${data.max_prompt_length} # not use for opensource - response_length: ${data.max_response_length} - # for vllm rollout - dtype: bfloat16 # should align with FSDP - gpu_memory_utilization: 0.4 - ignore_eos: False - enforce_eager: True - free_cache_engine: True - load_format: dummy_dtensor - tensor_model_parallel_size: 2 - max_num_batched_tokens: 8192 - max_model_len: null - max_num_seqs: 1024 - # log_prob_micro_batch_size: 8 # will be deprecated, use log_prob_micro_batch_size_per_gpu - log_prob_micro_batch_size_per_gpu: 4 - log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} - log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} - disable_log_stats: True - enable_chunked_prefill: True # could get higher throughput - # for hf rollout - do_sample: True - # number of responses (i.e. num sample times) - n: 8 # > 1 for grpo - -critic: - strategy: fsdp - optim: - lr: 1e-5 - lr_warmup_steps_ratio: 0. 
# the total steps will be injected during runtime - # min_lr_ratio: null # only useful for warmup with cosine - warmup_style: constant # select from constant/cosine - total_training_steps: -1 # must be override by program - model: - path: /PATH/TO/MODEL/ - tokenizer_path: ${actor_rollout_ref.model.path} - override_config: { } - external_lib: ${actor_rollout_ref.model.external_lib} - enable_gradient_checkpointing: True - use_remove_padding: False - fsdp_config: - param_offload: False - optimizer_offload: False - wrap_policy: - # transformer_layer_cls_to_wrap: None - min_num_params: 0 - fsdp_size: -1 - ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size} - # ppo_micro_batch_size: 8 # will be deprecated, use ppo_micro_batch_size_per_gpu - ppo_micro_batch_size_per_gpu: 64 - forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu} - use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} - ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2 - forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu} - ulysses_sequence_parallel_size: 1 # sp size - ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs} - shuffle: ${actor_rollout_ref.actor.shuffle} - grad_clip: 1.0 - cliprange_value: 0.5 - -reward_model: - enable: False - strategy: fsdp - model: - input_tokenizer: ${actor_rollout_ref.model.path} # set this to null if the chat template is identical - path: ~/models/FsfairX-LLaMA3-RM-v0.1 - external_lib: ${actor_rollout_ref.model.external_lib} - use_remove_padding: False - fsdp_config: - min_num_params: 0 - param_offload: False - fsdp_size: -1 - # micro_batch_size: null # will be deprecated, use micro_batch_size_per_gpu - # micro_batch_size_per_gpu: 2 # set a number - # max_length: null - ulysses_sequence_parallel_size: 1 # sp size - use_dynamic_bsz: ${critic.use_dynamic_bsz} - forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} custom_reward_function: path: null @@ -155,7 +55,6 @@ custom_reward_function: algorithm: gamma: 1.0 lam: 1.0 - adv_estimator: grpo kl_penalty: kl # how to estimate kl divergence kl_ctrl: type: fixed @@ -163,21 +62,10 @@ algorithm: trainer: balance_batch: True - total_epochs: 10 # total_training_steps: null - project_name: rft_example_gsm8k - experiment_name: cys-qwen2_1.5b_rollout8_grpo_kl0.001_lr1e-5 - logger: [ 'console','wandb' ] - val_generations_to_log_to_wandb: 0 - nnodes: 1 - n_gpus_per_node: 2 - save_freq: 100 # auto: find the last ckpt to resume. If can't find, start from scratch resume_mode: auto # or auto or resume_path if - test_freq: 5 - critic_warmup: 0 default_hdfs_dir: null remove_previous_ckpt_in_save: False del_local_ckpt_after_load: False - default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name} val_before_train: False diff --git a/examples/dpo_humanlike/train_dpo.yaml b/examples/dpo_humanlike/train_dpo.yaml index 09327877f9..8ffc68b397 100644 --- a/examples/dpo_humanlike/train_dpo.yaml +++ b/examples/dpo_humanlike/train_dpo.yaml @@ -1,23 +1,6 @@ -data: - tokenizer: null - train_files: /train.parquet # useless - val_files: /test.parquet # useless - prompt_key: prompt - max_prompt_length: 1792 - max_response_length: 256 - train_batch_size: 32 - val_batch_size: null - return_raw_input_ids: False # This should be set to true when the tokenizer between policy and rm differs - return_raw_chat: False - shuffle: True - filter_overlong_prompts: False # for large-scale dataset, filtering overlong prompts could be timeconsuming. 
You should disable this and set `truncation='left' - truncation: error - image_key: images - actor_rollout_ref: hybrid_engine: True model: - path: /PATH/TO/MODEL/CHECKPOINT/ external_lib: null override_config: { } enable_gradient_checkpointing: True @@ -25,7 +8,6 @@ actor_rollout_ref: actor: strategy: fsdp # This is for backward-compatibility ppo_mini_batch_size: 32 - # ppo_micro_batch_size: 8 # will be deprecated, use ppo_micro_batch_size_per_gpu ppo_micro_batch_size_per_gpu: 2 # NOTE use_dynamic_bsz: False ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length} @@ -64,87 +46,6 @@ actor_rollout_ref: log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size - rollout: - name: vllm - temperature: 1.0 - top_k: -1 # 0 for hf rollout, -1 for vllm rollout - top_p: 1 - use_fire_sampling: False # https://arxiv.org/abs/2410.21236 - prompt_length: ${data.max_prompt_length} # not use for opensource - response_length: ${data.max_response_length} - # for vllm rollout - dtype: bfloat16 # should align with FSDP - gpu_memory_utilization: 0.4 - ignore_eos: False - enforce_eager: True - free_cache_engine: True - load_format: dummy_dtensor - tensor_model_parallel_size: 2 - max_num_batched_tokens: 8192 - max_model_len: null - max_num_seqs: 1024 - # log_prob_micro_batch_size: 8 # will be deprecated, use log_prob_micro_batch_size_per_gpu - log_prob_micro_batch_size_per_gpu: 4 - log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} - log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} - disable_log_stats: True - enable_chunked_prefill: True # could get higher throughput - # for hf rollout - do_sample: True - -critic: - strategy: fsdp - optim: - lr: 1e-5 - lr_warmup_steps_ratio: 0. 
# the total steps will be injected during runtime - # min_lr_ratio: null # only useful for warmup with cosine - warmup_style: constant # select from constant/cosine - total_training_steps: 783 # must be override by program - model: - path: /PATH/TO/MODEL/CHECKPOINT/ - tokenizer_path: ${actor_rollout_ref.model.path} - override_config: { } - external_lib: ${actor_rollout_ref.model.external_lib} - enable_gradient_checkpointing: True - use_remove_padding: False - fsdp_config: - param_offload: False - optimizer_offload: False - wrap_policy: - # transformer_layer_cls_to_wrap: None - min_num_params: 0 - fsdp_size: -1 - ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size} - # ppo_micro_batch_size: 8 # will be deprecated, use ppo_micro_batch_size_per_gpu - ppo_micro_batch_size_per_gpu: 1 - forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu} - use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} - ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2 - forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu} - ulysses_sequence_parallel_size: 1 # sp size - ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs} - shuffle: ${actor_rollout_ref.actor.shuffle} - grad_clip: 1.0 - cliprange_value: 0.5 - -reward_model: - enable: False - strategy: fsdp - model: - input_tokenizer: ${actor_rollout_ref.model.path} # set this to null if the chat template is identical - path: ~/models/FsfairX-LLaMA3-RM-v0.1 - external_lib: ${actor_rollout_ref.model.external_lib} - use_remove_padding: False - fsdp_config: - min_num_params: 0 - param_offload: False - fsdp_size: -1 - # micro_batch_size: null # will be deprecated, use micro_batch_size_per_gpu - # micro_batch_size_per_gpu: 2 # set a number - # max_length: null - ulysses_sequence_parallel_size: 1 # sp size - use_dynamic_bsz: ${critic.use_dynamic_bsz} - forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} custom_reward_function: path: null @@ -153,7 +54,6 @@ custom_reward_function: algorithm: gamma: 1.0 lam: 1.0 - adv_estimator: grpo kl_penalty: kl kl_ctrl: type: fixed @@ -161,20 +61,10 @@ algorithm: trainer: balance_batch: False - total_epochs: 1 # total_training_steps: 783 # - project_name: dpo_example - experiment_name: trinity_dpo - logger: [ 'console','wandb' ] - val_generations_to_log_to_wandb: 0 - nnodes: 1 - n_gpus_per_node: 2 # auto: find the last ckpt to resume. If can't find, start from scratch resume_mode: auto # or auto or resume_path if - test_freq: 5 - critic_warmup: 0 default_hdfs_dir: null remove_previous_ckpt_in_save: False del_local_ckpt_after_load: False - default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name} val_before_train: False diff --git a/examples/grpo_alfworld/train_alfworld.yaml b/examples/grpo_alfworld/train_alfworld.yaml index a210c39916..215b1817ab 100644 --- a/examples/grpo_alfworld/train_alfworld.yaml +++ b/examples/grpo_alfworld/train_alfworld.yaml @@ -1,23 +1,6 @@ -data: - tokenizer: null - train_files: train_example.parquet - val_files: test_example.parquet - prompt_key: prompt - max_prompt_length: 4096 - max_response_length: 16384 - train_batch_size: 96 - val_batch_size: null - return_raw_input_ids: False # This should be set to true when the tokenizer between policy and rm differs - return_raw_chat: False - shuffle: True - filter_overlong_prompts: False # for large-scale dataset, filtering overlong prompts could be timeconsuming. 
You should disable this and set `truncation='left' - truncation: error - image_key: images - actor_rollout_ref: hybrid_engine: True model: - path: /PATH/TO/MODEL/CHECKPOINT/ external_lib: null override_config: { } enable_gradient_checkpointing: True @@ -25,7 +8,6 @@ actor_rollout_ref: actor: strategy: fsdp # This is for backward-compatibility ppo_mini_batch_size: 1536 - # ppo_micro_batch_size: 8 # will be deprecated, use ppo_micro_batch_size_per_gpu ppo_micro_batch_size_per_gpu: 1 use_dynamic_bsz: False ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length} @@ -57,94 +39,10 @@ actor_rollout_ref: wrap_policy: # transformer_layer_cls_to_wrap: None min_num_params: 0 - # log_prob_micro_batch_size: 4 # will be deprecated, use log_prob_micro_batch_size_per_gpu log_prob_micro_batch_size_per_gpu: 1 log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size - rollout: - name: vllm - temperature: 1.0 - top_k: -1 # 0 for hf rollout, -1 for vllm rollout - top_p: 1 - use_fire_sampling: False # https://arxiv.org/abs/2410.21236 - prompt_length: ${data.max_prompt_length} # not use for opensource - response_length: ${data.max_response_length} - # for vllm rollout - dtype: bfloat16 # should align with FSDP - gpu_memory_utilization: 0.4 - ignore_eos: False - enforce_eager: True - free_cache_engine: True - load_format: dummy_dtensor - tensor_model_parallel_size: 1 - max_num_batched_tokens: 8192 - max_model_len: null - max_num_seqs: 1024 - # log_prob_micro_batch_size: 8 # will be deprecated, use log_prob_micro_batch_size_per_gpu - log_prob_micro_batch_size_per_gpu: 1 - log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} - log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} - disable_log_stats: True - enable_chunked_prefill: True # could get higher throughput - # for hf rollout - do_sample: True - # number of responses (i.e. num sample times) - n: 8 # should be > 1 for grpo; Currently is unused parameter - -critic: - strategy: fsdp - optim: - lr: 1e-5 - lr_warmup_steps_ratio: 0. 
# the total steps will be injected during runtime - # min_lr_ratio: null # only useful for warmup with cosine - warmup_style: constant # select from constant/cosine - total_training_steps: -1 # must be override by program - model: - path: /PATH/TO/MODEL/CHECKPOINT/ - tokenizer_path: ${actor_rollout_ref.model.path} - override_config: { } - external_lib: ${actor_rollout_ref.model.external_lib} - enable_gradient_checkpointing: True - use_remove_padding: False - fsdp_config: - param_offload: False - optimizer_offload: False - wrap_policy: - # transformer_layer_cls_to_wrap: None - min_num_params: 0 - fsdp_size: -1 - ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size} - # ppo_micro_batch_size: 8 # will be deprecated, use ppo_micro_batch_size_per_gpu - ppo_micro_batch_size_per_gpu: 1 - forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu} - use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} - ppo_max_token_len_per_gpu: 16384 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2 - forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu} - ulysses_sequence_parallel_size: 1 # sp size - ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs} - shuffle: ${actor_rollout_ref.actor.shuffle} - grad_clip: 1.0 - cliprange_value: 0.5 - -reward_model: - enable: False - strategy: fsdp - model: - input_tokenizer: ${actor_rollout_ref.model.path} # set this to null if the chat template is identical - path: ~/models/FsfairX-LLaMA3-RM-v0.1 - external_lib: ${actor_rollout_ref.model.external_lib} - use_remove_padding: False - fsdp_config: - min_num_params: 0 - param_offload: False - fsdp_size: -1 - # micro_batch_size: null # will be deprecated, use micro_batch_size_per_gpu - # micro_batch_size_per_gpu: 2 # set a number - # max_length: null - ulysses_sequence_parallel_size: 1 # sp size - use_dynamic_bsz: ${critic.use_dynamic_bsz} - forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} custom_reward_function: path: null @@ -153,7 +51,6 @@ custom_reward_function: algorithm: gamma: 1.0 lam: 1.0 - adv_estimator: grpo kl_penalty: kl # how to estimate kl divergence kl_ctrl: type: fixed @@ -161,20 +58,10 @@ algorithm: trainer: balance_batch: True - total_epochs: 15 # total_training_steps: null - project_name: ALFWORLD - experiment_name: ALFWORLD_RFT - logger: [ 'wandb' ] - val_generations_to_log_to_wandb: 0 - nnodes: 1 - n_gpus_per_node: 2 # auto: find the last ckpt to resume. If can't find, start from scratch resume_mode: auto # or auto or resume_path if - test_freq: 100 - critic_warmup: 0 default_hdfs_dir: null remove_previous_ckpt_in_save: False del_local_ckpt_after_load: False - default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name} val_before_train: False diff --git a/examples/grpo_gsm8k/train_gsm8k.yaml b/examples/grpo_gsm8k/train_gsm8k.yaml index 13b195f557..de1b08f590 100644 --- a/examples/grpo_gsm8k/train_gsm8k.yaml +++ b/examples/grpo_gsm8k/train_gsm8k.yaml @@ -1,23 +1,6 @@ -data: - tokenizer: null - train_files: train_example.parquet - val_files: test_example.parquet - prompt_key: prompt - max_prompt_length: 256 - max_response_length: 1024 - train_batch_size: 256 - val_batch_size: null - return_raw_input_ids: False # This should be set to true when the tokenizer between policy and rm differs - return_raw_chat: False - shuffle: True - filter_overlong_prompts: False # for large-scale dataset, filtering overlong prompts could be timeconsuming. 
You should disable this and set `truncation='left' - truncation: error - image_key: images - actor_rollout_ref: hybrid_engine: True model: - path: /PATH/TO/MODEL/CHECKPOINT/ external_lib: null override_config: { } enable_gradient_checkpointing: True @@ -25,7 +8,6 @@ actor_rollout_ref: actor: strategy: fsdp # This is for backward-compatibility ppo_mini_batch_size: 128 - # ppo_micro_batch_size: 8 # will be deprecated, use ppo_micro_batch_size_per_gpu ppo_micro_batch_size_per_gpu: 4 use_dynamic_bsz: True # False ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length} @@ -61,94 +43,10 @@ actor_rollout_ref: wrap_policy: # transformer_layer_cls_to_wrap: None min_num_params: 0 - # log_prob_micro_batch_size: 4 # will be deprecated, use log_prob_micro_batch_size_per_gpu log_prob_micro_batch_size_per_gpu: 16 log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size - rollout: - name: vllm - temperature: 1.0 - top_k: -1 # 0 for hf rollout, -1 for vllm rollout - top_p: 1 - use_fire_sampling: False # https://arxiv.org/abs/2410.21236 - prompt_length: ${data.max_prompt_length} # not use for opensource - response_length: ${data.max_response_length} - # for vllm rollout - dtype: bfloat16 # should align with FSDP - gpu_memory_utilization: 0.4 - ignore_eos: False - enforce_eager: True - free_cache_engine: True - load_format: dummy_dtensor - tensor_model_parallel_size: 2 - max_num_batched_tokens: 8192 - max_model_len: null - max_num_seqs: 1024 - # log_prob_micro_batch_size: 8 # will be deprecated, use log_prob_micro_batch_size_per_gpu - log_prob_micro_batch_size_per_gpu: 4 - log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} - log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} - disable_log_stats: True - enable_chunked_prefill: True # could get higher throughput - # for hf rollout - do_sample: True - # number of responses (i.e. num sample times) - n: 8 # > 1 for grpo - -critic: - strategy: fsdp - optim: - lr: 1e-5 - lr_warmup_steps_ratio: 0. 
# the total steps will be injected during runtime - # min_lr_ratio: null # only useful for warmup with cosine - warmup_style: constant # select from constant/cosine - total_training_steps: -1 # must be override by program - model: - path: /PATH/TO/MODEL/CHECKPOINT/ - tokenizer_path: ${actor_rollout_ref.model.path} - override_config: { } - external_lib: ${actor_rollout_ref.model.external_lib} - enable_gradient_checkpointing: True - use_remove_padding: False - fsdp_config: - param_offload: False - optimizer_offload: False - wrap_policy: - # transformer_layer_cls_to_wrap: None - min_num_params: 0 - fsdp_size: -1 - ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size} - # ppo_micro_batch_size: 8 # will be deprecated, use ppo_micro_batch_size_per_gpu - ppo_micro_batch_size_per_gpu: 64 - forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu} - use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} - ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2 - forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu} - ulysses_sequence_parallel_size: 1 # sp size - ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs} - shuffle: ${actor_rollout_ref.actor.shuffle} - grad_clip: 1.0 - cliprange_value: 0.5 - -reward_model: - enable: False - strategy: fsdp - model: - input_tokenizer: ${actor_rollout_ref.model.path} # set this to null if the chat template is identical - path: ~/models/FsfairX-LLaMA3-RM-v0.1 - external_lib: ${actor_rollout_ref.model.external_lib} - use_remove_padding: False - fsdp_config: - min_num_params: 0 - param_offload: False - fsdp_size: -1 - # micro_batch_size: null # will be deprecated, use micro_batch_size_per_gpu - # micro_batch_size_per_gpu: 2 # set a number - # max_length: null - ulysses_sequence_parallel_size: 1 # sp size - use_dynamic_bsz: ${critic.use_dynamic_bsz} - forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} custom_reward_function: path: null @@ -157,7 +55,6 @@ custom_reward_function: algorithm: gamma: 1.0 lam: 1.0 - adv_estimator: grpo kl_penalty: kl # how to estimate kl divergence kl_ctrl: type: fixed @@ -165,20 +62,10 @@ algorithm: trainer: balance_batch: True - total_epochs: 10 # total_training_steps: null - project_name: rft_example_gsm8k - experiment_name: cys-qwen2_1.5b_rollout8_grpo_kl0.001_lr1e-5 - logger: [ 'console','wandb' ] - val_generations_to_log_to_wandb: 0 - nnodes: 1 - n_gpus_per_node: 2 # auto: find the last ckpt to resume. If can't find, start from scratch resume_mode: auto # or auto or resume_path if - test_freq: 5 - critic_warmup: 0 default_hdfs_dir: null remove_previous_ckpt_in_save: False del_local_ckpt_after_load: False - default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name} val_before_train: False diff --git a/examples/grpo_math/train_math.yaml b/examples/grpo_math/train_math.yaml index 2482ccc785..78bcb862c6 100644 --- a/examples/grpo_math/train_math.yaml +++ b/examples/grpo_math/train_math.yaml @@ -1,23 +1,6 @@ -data: - tokenizer: null - train_files: train_example.parquet - val_files: test_example.parquet - prompt_key: prompt - max_prompt_length: 1024 - max_response_length: 2048 - # train_batch_size: 256 - val_batch_size: null - return_raw_input_ids: False # This should be set to true when the tokenizer between policy and rm differs - return_raw_chat: False - shuffle: True - filter_overlong_prompts: False # for large-scale dataset, filtering overlong prompts could be timeconsuming. 
You should disable this and set `truncation='left' - truncation: error - image_key: images - actor_rollout_ref: hybrid_engine: True model: - path: /PATH/TO/MODEL/ external_lib: null override_config: { } enable_gradient_checkpointing: True @@ -64,84 +47,6 @@ actor_rollout_ref: log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size - rollout: - name: vllm - temperature: 1.0 - top_k: -1 # 0 for hf rollout, -1 for vllm rollout - top_p: 1 - use_fire_sampling: False # https://arxiv.org/abs/2410.21236 - prompt_length: ${data.max_prompt_length} # not use for opensource - response_length: ${data.max_response_length} - # for vllm rollout - dtype: bfloat16 # should align with FSDP - gpu_memory_utilization: 0.4 - ignore_eos: False - enforce_eager: True - free_cache_engine: True - load_format: dummy_dtensor - tensor_model_parallel_size: 2 - max_num_batched_tokens: 8192 - max_model_len: null - max_num_seqs: 1024 - log_prob_micro_batch_size_per_gpu: 4 - log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} - log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} - disable_log_stats: True - enable_chunked_prefill: True # could get higher throughput - # for hf rollout - do_sample: True - # number of responses (i.e. num sample times) - n: 8 # > 1 for grpo - -critic: - strategy: fsdp - optim: - lr: 1e-5 - lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime - # min_lr_ratio: null # only useful for warmup with cosine - warmup_style: constant # select from constant/cosine - total_training_steps: -1 # must be override by program - model: - path: /PATH/TO/MODEL/ - tokenizer_path: ${actor_rollout_ref.model.path} - override_config: { } - external_lib: ${actor_rollout_ref.model.external_lib} - enable_gradient_checkpointing: True - use_remove_padding: False - fsdp_config: - param_offload: False - optimizer_offload: False - wrap_policy: - # transformer_layer_cls_to_wrap: None - min_num_params: 0 - fsdp_size: -1 - ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size} - ppo_micro_batch_size_per_gpu: 64 - forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu} - use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} - ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2 - forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu} - ulysses_sequence_parallel_size: 1 # sp size - ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs} - shuffle: ${actor_rollout_ref.actor.shuffle} - grad_clip: 1.0 - cliprange_value: 0.5 - -reward_model: - enable: False - strategy: fsdp - model: - input_tokenizer: ${actor_rollout_ref.model.path} # set this to null if the chat template is identical - path: ~/models/FsfairX-LLaMA3-RM-v0.1 - external_lib: ${actor_rollout_ref.model.external_lib} - use_remove_padding: False - fsdp_config: - min_num_params: 0 - param_offload: False - fsdp_size: -1 - ulysses_sequence_parallel_size: 1 # sp size - use_dynamic_bsz: ${critic.use_dynamic_bsz} - forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} custom_reward_function: path: null @@ -150,7 +55,6 @@ custom_reward_function: algorithm: gamma: 1.0 lam: 1.0 - adv_estimator: grpo kl_penalty: kl # how to estimate kl divergence kl_ctrl: type: fixed @@ -158,19 +62,9 @@ algorithm: trainer: balance_batch: 
True - total_epochs: 20 - project_name: grpo_math - experiment_name: grpo_math_example - logger: [ 'console','wandb' ] - val_generations_to_log_to_wandb: 0 - nnodes: 1 - n_gpus_per_node: 2 # auto: find the last ckpt to resume. If can't find, start from scratch resume_mode: auto # or auto or resume_path if - test_freq: 5 - critic_warmup: 0 default_hdfs_dir: null remove_previous_ckpt_in_save: False del_local_ckpt_after_load: False - default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name} val_before_train: False diff --git a/examples/grpo_sciworld/train_sciworld.yaml b/examples/grpo_sciworld/train_sciworld.yaml index 833441142c..215b1817ab 100644 --- a/examples/grpo_sciworld/train_sciworld.yaml +++ b/examples/grpo_sciworld/train_sciworld.yaml @@ -1,23 +1,6 @@ -data: - tokenizer: null - train_files: train_example.parquet - val_files: test_example.parquet - prompt_key: prompt - max_prompt_length: 4096 - max_response_length: 16384 - train_batch_size: 96 - val_batch_size: null - return_raw_input_ids: False # This should be set to true when the tokenizer between policy and rm differs - return_raw_chat: False - shuffle: True - filter_overlong_prompts: False # for large-scale dataset, filtering overlong prompts could be timeconsuming. You should disable this and set `truncation='left' - truncation: error - image_key: images - actor_rollout_ref: hybrid_engine: True model: - path: /PATH/TO/MODEL/CHECKPOINT/ external_lib: null override_config: { } enable_gradient_checkpointing: True @@ -60,86 +43,6 @@ actor_rollout_ref: log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size - rollout: - name: vllm - temperature: 1.0 - top_k: -1 # 0 for hf rollout, -1 for vllm rollout - top_p: 1 - use_fire_sampling: False # https://arxiv.org/abs/2410.21236 - prompt_length: ${data.max_prompt_length} # not use for opensource - response_length: ${data.max_response_length} - # for vllm rollout - dtype: bfloat16 # should align with FSDP - gpu_memory_utilization: 0.4 - ignore_eos: False - enforce_eager: True - free_cache_engine: True - load_format: dummy_dtensor - tensor_model_parallel_size: 1 - max_num_batched_tokens: 8192 - max_model_len: null - max_num_seqs: 1024 - log_prob_micro_batch_size_per_gpu: 1 - log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} - log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} - disable_log_stats: True - enable_chunked_prefill: True # could get higher throughput - # for hf rollout - do_sample: True - # number of responses (i.e. num sample times) - n: 8 # should be > 1 for grpo; Currently is unused parameter - -critic: - strategy: fsdp - optim: - lr: 1e-5 - lr_warmup_steps_ratio: 0. 
# the total steps will be injected during runtime - # min_lr_ratio: null # only useful for warmup with cosine - warmup_style: constant # select from constant/cosine - total_training_steps: -1 # must be override by program - model: - path: /PATH/TO/MODEL/CHECKPOINT/ - tokenizer_path: ${actor_rollout_ref.model.path} - override_config: { } - external_lib: ${actor_rollout_ref.model.external_lib} - enable_gradient_checkpointing: True - use_remove_padding: False - fsdp_config: - param_offload: False - optimizer_offload: False - wrap_policy: - # transformer_layer_cls_to_wrap: None - min_num_params: 0 - fsdp_size: -1 - ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size} - ppo_micro_batch_size_per_gpu: 1 - forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu} - use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} - ppo_max_token_len_per_gpu: 16384 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2 - forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu} - ulysses_sequence_parallel_size: 1 # sp size - ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs} - shuffle: ${actor_rollout_ref.actor.shuffle} - grad_clip: 1.0 - cliprange_value: 0.5 - -reward_model: - enable: False - strategy: fsdp - model: - input_tokenizer: ${actor_rollout_ref.model.path} # set this to null if the chat template is identical - path: ~/models/FsfairX-LLaMA3-RM-v0.1 - external_lib: ${actor_rollout_ref.model.external_lib} - use_remove_padding: False - fsdp_config: - min_num_params: 0 - param_offload: False - fsdp_size: -1 - # micro_batch_size_per_gpu: 2 # set a number - # max_length: null - ulysses_sequence_parallel_size: 1 # sp size - use_dynamic_bsz: ${critic.use_dynamic_bsz} - forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} custom_reward_function: path: null @@ -148,7 +51,6 @@ custom_reward_function: algorithm: gamma: 1.0 lam: 1.0 - adv_estimator: grpo kl_penalty: kl # how to estimate kl divergence kl_ctrl: type: fixed @@ -156,20 +58,10 @@ algorithm: trainer: balance_batch: True - total_epochs: 15 # total_training_steps: null - project_name: sciworld - experiment_name: sciworld_RFT - logger: [ 'wandb' ] - val_generations_to_log_to_wandb: 0 - nnodes: 1 - n_gpus_per_node: 2 # auto: find the last ckpt to resume. If can't find, start from scratch resume_mode: auto # or auto or resume_path if - test_freq: 100 - critic_warmup: 0 default_hdfs_dir: null remove_previous_ckpt_in_save: False del_local_ckpt_after_load: False - default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name} val_before_train: False diff --git a/examples/grpo_webshop/train_webshop.yaml b/examples/grpo_webshop/train_webshop.yaml index ac502fec3f..215b1817ab 100644 --- a/examples/grpo_webshop/train_webshop.yaml +++ b/examples/grpo_webshop/train_webshop.yaml @@ -1,23 +1,6 @@ -data: - tokenizer: null - train_files: train_example.parquet - val_files: test_example.parquet - prompt_key: prompt - max_prompt_length: 4096 - max_response_length: 16384 - train_batch_size: 96 - val_batch_size: null - return_raw_input_ids: False # This should be set to true when the tokenizer between policy and rm differs - return_raw_chat: False - shuffle: True - filter_overlong_prompts: False # for large-scale dataset, filtering overlong prompts could be timeconsuming. 
You should disable this and set `truncation='left' - truncation: error - image_key: images - actor_rollout_ref: hybrid_engine: True model: - path: /PATH/TO/MODEL/CHECKPOINT/ external_lib: null override_config: { } enable_gradient_checkpointing: True @@ -25,7 +8,6 @@ actor_rollout_ref: actor: strategy: fsdp # This is for backward-compatibility ppo_mini_batch_size: 1536 - # ppo_micro_batch_size: 8 # will be deprecated, use ppo_micro_batch_size_per_gpu ppo_micro_batch_size_per_gpu: 1 use_dynamic_bsz: False ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length} @@ -57,94 +39,10 @@ actor_rollout_ref: wrap_policy: # transformer_layer_cls_to_wrap: None min_num_params: 0 - # log_prob_micro_batch_size: 4 # will be deprecated, use log_prob_micro_batch_size_per_gpu log_prob_micro_batch_size_per_gpu: 1 log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size - rollout: - name: vllm - temperature: 1.0 - top_k: -1 # 0 for hf rollout, -1 for vllm rollout - top_p: 1 - use_fire_sampling: False # https://arxiv.org/abs/2410.21236 - prompt_length: ${data.max_prompt_length} # not use for opensource - response_length: ${data.max_response_length} - # for vllm rollout - dtype: bfloat16 # should align with FSDP - gpu_memory_utilization: 0.4 - ignore_eos: False - enforce_eager: True - free_cache_engine: True - load_format: dummy_dtensor - tensor_model_parallel_size: 1 - max_num_batched_tokens: 8192 - max_model_len: null - max_num_seqs: 1024 - # log_prob_micro_batch_size: 8 # will be deprecated, use log_prob_micro_batch_size_per_gpu - log_prob_micro_batch_size_per_gpu: 1 - log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} - log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} - disable_log_stats: True - enable_chunked_prefill: True # could get higher throughput - # for hf rollout - do_sample: True - # number of responses (i.e. num sample times) - n: 8 # should be > 1 for grpo; Currently is unused parameter - -critic: - strategy: fsdp - optim: - lr: 1e-5 - lr_warmup_steps_ratio: 0. 
# the total steps will be injected during runtime - # min_lr_ratio: null # only useful for warmup with cosine - warmup_style: constant # select from constant/cosine - total_training_steps: -1 # must be override by program - model: - path: /PATH/TO/MODEL/CHECKPOINT/ - tokenizer_path: ${actor_rollout_ref.model.path} - override_config: { } - external_lib: ${actor_rollout_ref.model.external_lib} - enable_gradient_checkpointing: True - use_remove_padding: False - fsdp_config: - param_offload: False - optimizer_offload: False - wrap_policy: - # transformer_layer_cls_to_wrap: None - min_num_params: 0 - fsdp_size: -1 - ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size} - # ppo_micro_batch_size: 8 # will be deprecated, use ppo_micro_batch_size_per_gpu - ppo_micro_batch_size_per_gpu: 1 - forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu} - use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} - ppo_max_token_len_per_gpu: 16384 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2 - forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu} - ulysses_sequence_parallel_size: 1 # sp size - ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs} - shuffle: ${actor_rollout_ref.actor.shuffle} - grad_clip: 1.0 - cliprange_value: 0.5 - -reward_model: - enable: False - strategy: fsdp - model: - input_tokenizer: ${actor_rollout_ref.model.path} # set this to null if the chat template is identical - path: ~/models/FsfairX-LLaMA3-RM-v0.1 - external_lib: ${actor_rollout_ref.model.external_lib} - use_remove_padding: False - fsdp_config: - min_num_params: 0 - param_offload: False - fsdp_size: -1 - # micro_batch_size: null # will be deprecated, use micro_batch_size_per_gpu - # micro_batch_size_per_gpu: 2 # set a number - # max_length: null - ulysses_sequence_parallel_size: 1 # sp size - use_dynamic_bsz: ${critic.use_dynamic_bsz} - forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} custom_reward_function: path: null @@ -153,7 +51,6 @@ custom_reward_function: algorithm: gamma: 1.0 lam: 1.0 - adv_estimator: grpo kl_penalty: kl # how to estimate kl divergence kl_ctrl: type: fixed @@ -161,20 +58,10 @@ algorithm: trainer: balance_batch: True - total_epochs: 15 # total_training_steps: null - project_name: WEBSHOP - experiment_name: WEBSHOP_RFT - logger: [ 'wandb' ] - val_generations_to_log_to_wandb: 0 - nnodes: 1 - n_gpus_per_node: 2 # auto: find the last ckpt to resume. 
If can't find, start from scratch resume_mode: auto # or auto or resume_path if - test_freq: 100 - critic_warmup: 0 default_hdfs_dir: null remove_previous_ckpt_in_save: False del_local_ckpt_after_load: False - default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name} val_before_train: False diff --git a/examples/opmd_gsm8k/train_opmd_gsm8k.yaml b/examples/opmd_gsm8k/train_opmd_gsm8k.yaml index 88f92fb461..326904d987 100644 --- a/examples/opmd_gsm8k/train_opmd_gsm8k.yaml +++ b/examples/opmd_gsm8k/train_opmd_gsm8k.yaml @@ -22,26 +22,9 @@ # adv_estimator: grpo # merely to disable critic model, doesn't affect adv compute when algorithm_type is opmd -data: - tokenizer: null - train_files: /train.jsonl - val_files: /test.jsonl - prompt_key: prompt - max_prompt_length: 256 - max_response_length: 1024 - train_batch_size: 256 - val_batch_size: null - return_raw_input_ids: False # This should be set to true when the tokenizer between policy and rm differs - return_raw_chat: False - shuffle: True - filter_overlong_prompts: False # for large-scale dataset, filtering overlong prompts could be timeconsuming. You should disable this and set `truncation='left' - truncation: error - image_key: images - actor_rollout_ref: hybrid_engine: True model: - path: path_to_models/Qwen2.5-1.5B-Inst external_lib: null override_config: { } enable_gradient_checkpointing: True @@ -49,7 +32,6 @@ actor_rollout_ref: actor: strategy: fsdp # This is for backward-compatibility ppo_mini_batch_size: 128 - # ppo_micro_batch_size: 8 # will be deprecated, use ppo_micro_batch_size_per_gpu ppo_micro_batch_size_per_gpu: 4 use_dynamic_bsz: True ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length} @@ -87,94 +69,10 @@ actor_rollout_ref: wrap_policy: # transformer_layer_cls_to_wrap: None min_num_params: 0 - # log_prob_micro_batch_size: 4 # will be deprecated, use log_prob_micro_batch_size_per_gpu log_prob_micro_batch_size_per_gpu: 16 log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size - rollout: - name: vllm - temperature: 1.0 - top_k: -1 # 0 for hf rollout, -1 for vllm rollout - top_p: 1 - use_fire_sampling: False # https://arxiv.org/abs/2410.21236 - prompt_length: ${data.max_prompt_length} # not use for opensource - response_length: ${data.max_response_length} - # for vllm rollout - dtype: bfloat16 # should align with FSDP - gpu_memory_utilization: 0.4 - ignore_eos: False - enforce_eager: True - free_cache_engine: True - load_format: dummy_dtensor - tensor_model_parallel_size: 2 - max_num_batched_tokens: 8192 - max_model_len: null - max_num_seqs: 1024 - # log_prob_micro_batch_size: 8 # will be deprecated, use log_prob_micro_batch_size_per_gpu - log_prob_micro_batch_size_per_gpu: 4 - log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} - log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} - disable_log_stats: True - enable_chunked_prefill: True # could get higher throughput - # for hf rollout - do_sample: True - # number of responses (i.e. num sample times) - n: 8 # > 1 for grpo - -critic: - strategy: fsdp - optim: - lr: 1e-5 - lr_warmup_steps_ratio: 0. 
# the total steps will be injected during runtime - # min_lr_ratio: null # only useful for warmup with cosine - warmup_style: constant # select from constant/cosine - total_training_steps: -1 # must be override by program - model: - path: path_to_models/Qwen2.5-1.5B-Inst - tokenizer_path: ${actor_rollout_ref.model.path} - override_config: { } - external_lib: ${actor_rollout_ref.model.external_lib} - enable_gradient_checkpointing: True - use_remove_padding: False - fsdp_config: - param_offload: False - optimizer_offload: False - wrap_policy: - # transformer_layer_cls_to_wrap: None - min_num_params: 0 - fsdp_size: -1 - ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size} - # ppo_micro_batch_size: 8 # will be deprecated, use ppo_micro_batch_size_per_gpu - ppo_micro_batch_size_per_gpu: 64 - forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu} - use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} - ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2 - forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu} - ulysses_sequence_parallel_size: 1 # sp size - ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs} - shuffle: ${actor_rollout_ref.actor.shuffle} - grad_clip: 1.0 - cliprange_value: 0.5 - -reward_model: - enable: False - strategy: fsdp - model: - input_tokenizer: ${actor_rollout_ref.model.path} # set this to null if the chat template is identical - path: ~/models/FsfairX-LLaMA3-RM-v0.1 - external_lib: ${actor_rollout_ref.model.external_lib} - use_remove_padding: False - fsdp_config: - min_num_params: 0 - param_offload: False - fsdp_size: -1 - # micro_batch_size: null # will be deprecated, use micro_batch_size_per_gpu - # micro_batch_size_per_gpu: 2 # set a number - # max_length: null - ulysses_sequence_parallel_size: 1 # sp size - use_dynamic_bsz: ${critic.use_dynamic_bsz} - forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} custom_reward_function: path: null @@ -183,7 +81,6 @@ custom_reward_function: algorithm: gamma: 1.0 lam: 1.0 - adv_estimator: grpo kl_penalty: kl # how to estimate kl divergence kl_ctrl: type: fixed @@ -191,22 +88,12 @@ algorithm: trainer: balance_batch: True - total_epochs: 10 # total_training_steps: null - project_name: Trinity-RFT-gsm8k-test-opmd - experiment_name: qwen2.5-1.5B-gsm8k-opmd-kl_0.001-entropy_0-tau_4-beta1_0.0-beta2_0.95-lr_2e-6-sync10 - logger: [ 'console','wandb' ] - val_generations_to_log_to_wandb: 0 - nnodes: 1 - n_gpus_per_node: 2 # auto: find the last ckpt to resume. 
If can't find, start from scratch resume_mode: auto # or auto or resume_path if - test_freq: 100 - critic_warmup: 0 default_hdfs_dir: null remove_previous_ckpt_in_save: False del_local_ckpt_after_load: False - default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name} val_before_train: False max_actor_ckpt_to_keep: 5 max_critic_ckpt_to_keep: 5 diff --git a/examples/ppo_countdown/train_countdown.yaml b/examples/ppo_countdown/train_countdown.yaml index 291afe452f..ae16122ef7 100644 --- a/examples/ppo_countdown/train_countdown.yaml +++ b/examples/ppo_countdown/train_countdown.yaml @@ -1,23 +1,6 @@ -data: - tokenizer: null - train_files: train_example.parquet - val_files: test_example.parquet - prompt_key: prompt - max_prompt_length: 256 - max_response_length: 1024 - train_batch_size: 256 - val_batch_size: null - return_raw_input_ids: False # This should be set to true when the tokenizer between policy and rm differs - return_raw_chat: False - shuffle: True - filter_overlong_prompts: False # for large-scale dataset, filtering overlong prompts could be timeconsuming. You should disable this and set `truncation='left' - truncation: error - image_key: images - actor_rollout_ref: hybrid_engine: True model: - path: /PATH/TO/MODEL/CHECKPOINT/ external_lib: null override_config: { } enable_gradient_checkpointing: True @@ -25,7 +8,6 @@ actor_rollout_ref: actor: strategy: fsdp # This is for backward-compatibility ppo_mini_batch_size: 128 - # ppo_micro_batch_size: 8 # will be deprecated, use ppo_micro_batch_size_per_gpu ppo_micro_batch_size_per_gpu: 4 use_dynamic_bsz: True ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length} @@ -63,40 +45,10 @@ actor_rollout_ref: wrap_policy: # transformer_layer_cls_to_wrap: None min_num_params: 0 - # log_prob_micro_batch_size: 4 # will be deprecated, use log_prob_micro_batch_size_per_gpu log_prob_micro_batch_size_per_gpu: 8 log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size - rollout: - name: vllm - temperature: 1.0 - top_k: -1 # 0 for hf rollout, -1 for vllm rollout - top_p: 1 - use_fire_sampling: False # https://arxiv.org/abs/2410.21236 - prompt_length: ${data.max_prompt_length} # not use for opensource - response_length: ${data.max_response_length} - # for vllm rollout - dtype: bfloat16 # should align with FSDP - gpu_memory_utilization: 0.4 - ignore_eos: False - enforce_eager: True - free_cache_engine: True - load_format: dummy_dtensor - tensor_model_parallel_size: 2 - max_num_batched_tokens: 8192 - max_model_len: null - max_num_seqs: 1024 - # log_prob_micro_batch_size: 8 # will be deprecated, use log_prob_micro_batch_size_per_gpu - log_prob_micro_batch_size_per_gpu: 4 - log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} - log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} - disable_log_stats: True - enable_chunked_prefill: True # could get higher throughput - # for hf rollout - do_sample: True - # number of responses (i.e. 
num sample times) - n: 1 # > 1 for grpo critic: strategy: fsdp @@ -107,8 +59,6 @@ critic: warmup_style: constant # select from constant/cosine total_training_steps: -1 # must be override by program model: - path: /PATH/TO/MODEL/CHECKPOINT/ - tokenizer_path: ${actor_rollout_ref.model.path} override_config: { } external_lib: ${actor_rollout_ref.model.external_lib} enable_gradient_checkpointing: True @@ -121,7 +71,6 @@ critic: min_num_params: 0 fsdp_size: -1 ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size} - # ppo_micro_batch_size: 8 # will be deprecated, use ppo_micro_batch_size_per_gpu ppo_micro_batch_size_per_gpu: 8 forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu} use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} @@ -133,25 +82,6 @@ critic: grad_clip: 1.0 cliprange_value: 0.5 -reward_model: - enable: False - strategy: fsdp - model: - input_tokenizer: ${actor_rollout_ref.model.path} # set this to null if the chat template is identical - path: ~/models/FsfairX-LLaMA3-RM-v0.1 - external_lib: ${actor_rollout_ref.model.external_lib} - use_remove_padding: False - fsdp_config: - min_num_params: 0 - param_offload: False - fsdp_size: -1 - # micro_batch_size: null # will be deprecated, use micro_batch_size_per_gpu - # micro_batch_size_per_gpu: 2 # set a number - # max_length: null - ulysses_sequence_parallel_size: 1 # sp size - use_dynamic_bsz: ${critic.use_dynamic_bsz} - forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} - custom_reward_function: path: null name: compute_score @@ -159,7 +89,6 @@ custom_reward_function: algorithm: gamma: 1.0 lam: 1.0 - adv_estimator: gae kl_penalty: kl # how to estimate kl divergence kl_ctrl: type: fixed @@ -167,22 +96,13 @@ algorithm: trainer: balance_batch: True - total_epochs: 15 # total_training_steps: null - project_name: TinyZero - experiment_name: trinity-qwen2.5-1.5b - logger: [ 'wandb' ] - val_generations_to_log_to_wandb: 0 - nnodes: 1 - n_gpus_per_node: 2 # auto: find the last ckpt to resume. 
If can't find, start from scratch resume_mode: auto # or auto or resume_path if - test_freq: 100 critic_warmup: 0 default_hdfs_dir: null remove_previous_ckpt_in_save: False del_local_ckpt_after_load: False - default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name} val_before_train: False max_actor_ckpt_to_keep: 5 max_critic_ckpt_to_keep: 5 diff --git a/tests/common/config_test.py b/tests/common/config_test.py index 35b9a4f9c7..e1ac0aa7d4 100644 --- a/tests/common/config_test.py +++ b/tests/common/config_test.py @@ -46,7 +46,8 @@ def test_all_examples_are_valid(self): print(f"Checking config: {filename}") config_path = os.path.join(example_dir, example_name, filename) try: - load_config(config_path) + config = load_config(config_path) + config.check_and_update() except Exception as e: print(f"Error loading config {config_path}: {e}") raise e diff --git a/tests/template/verl_config.yaml b/tests/template/verl_config.yaml index d1e84cb455..b17fc87958 100644 --- a/tests/template/verl_config.yaml +++ b/tests/template/verl_config.yaml @@ -8,7 +8,6 @@ actor_rollout_ref: actor: strategy: fsdp # This is for backward-compatibility ppo_mini_batch_size: 4 - # ppo_micro_batch_size: 8 # will be deprecated, use ppo_micro_batch_size_per_gpu ppo_micro_batch_size_per_gpu: 1 use_dynamic_bsz: True ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length} @@ -46,37 +45,10 @@ actor_rollout_ref: wrap_policy: # transformer_layer_cls_to_wrap: None min_num_params: 0 - # log_prob_micro_batch_size: 4 # will be deprecated, use log_prob_micro_batch_size_per_gpu log_prob_micro_batch_size_per_gpu: 1 log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size - rollout: - name: vllm - use_fire_sampling: False # https://arxiv.org/abs/2410.21236 - prompt_length: ${data.max_prompt_length} # not use for opensource - response_length: ${data.max_response_length} - # for vllm rollout - dtype: bfloat16 # should align with FSDP - gpu_memory_utilization: 0.4 - ignore_eos: False - enforce_eager: True - free_cache_engine: True - load_format: dummy_dtensor - tensor_model_parallel_size: 2 - max_num_batched_tokens: 8192 - max_model_len: null - max_num_seqs: 1024 - # log_prob_micro_batch_size: 8 # will be deprecated, use log_prob_micro_batch_size_per_gpu - log_prob_micro_batch_size_per_gpu: 1 - log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} - log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} - disable_log_stats: True - enable_chunked_prefill: True # could get higher throughput - # for hf rollout - do_sample: True - # number of responses (i.e. 
-    n: 1 # > 1 for grpo
critic:
  strategy: fsdp
@@ -87,7 +59,6 @@ critic:
    warmup_style: constant # select from constant/cosine
    total_training_steps: -1 # must be override by program
  model:
-    tokenizer_path: ${actor_rollout_ref.model.path}
    override_config: { }
    external_lib: ${actor_rollout_ref.model.external_lib}
    enable_gradient_checkpointing: True
@@ -100,7 +71,6 @@ critic:
        min_num_params: 0
      fsdp_size: -1
  ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
-  # ppo_micro_batch_size: 8 # will be deprecated, use ppo_micro_batch_size_per_gpu
  ppo_micro_batch_size_per_gpu: 1
  forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
  use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
@@ -115,7 +85,6 @@ critic:
algorithm:
  gamma: 1.0
  lam: 1.0
-  adv_estimator: gae
  kl_penalty: kl # how to estimate kl divergence
  kl_ctrl:
    type: fixed
@@ -123,22 +92,13 @@ algorithm:

trainer:
  balance_batch: True
-  total_epochs: 10
  # total_training_steps: null
-  project_name: TinyZero
-  experiment_name: trinity-qwen2.5-1.5b
-  logger: [ 'wandb' ]
-  val_generations_to_log_to_wandb: 0
-  nnodes: 1
-  n_gpus_per_node: 2
-  save_freq: 20
  # auto: find the last ckpt to resume. If can't find, start from scratch
  resume_mode: auto # or auto or resume_path if
  critic_warmup: 0
  default_hdfs_dir: null
  remove_previous_ckpt_in_save: False
  del_local_ckpt_after_load: False
-  default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
  val_before_train: False
  max_actor_ckpt_to_keep: 1
  max_critic_ckpt_to_keep: 1
diff --git a/trinity/common/config.py b/trinity/common/config.py
index b2703d4d2d..e0660ab03a 100644
--- a/trinity/common/config.py
+++ b/trinity/common/config.py
@@ -173,8 +173,8 @@ class AlgorithmConfig:
    algorithm_type: AlgorithmType = AlgorithmType.PPO
    # for GRPO-like algorithms, repeat each task for `repeat_times` times
    repeat_times: int = 1
-    gamma: float = 1.0
-    lam: float = 1.0
+    gamma: Optional[float] = None
+    lam: Optional[float] = None
    # TODO: add more algorithm params here
@@ -259,19 +259,20 @@ class ExplorerConfig:
@dataclass
class TrainerConfig:
    trainer_type: str = "verl"
-    trainer_config_path: str = ""
    save_interval: int = 0
    enable_preview: bool = True # enable rollout preview in wandb
    # trainer configs
-    actor_use_kl_loss: bool = False
-    actor_kl_loss_coef: float = 0.001
-    actor_entropy_coef: float = 0.001
-    actor_grad_clip: float = 1.0
-    actor_clip_ratio: float = 0.2
+    actor_use_kl_loss: Optional[bool] = None
+    actor_kl_loss_coef: Optional[float] = None
+    actor_entropy_coef: Optional[float] = None
+    actor_grad_clip: Optional[float] = None
+    actor_clip_ratio: Optional[float] = None
    # TODO: extract more train-related params from underlying trainer engine
+    # Only one needs to be set for `trainer_config` and `trainer_config_path`
    trainer_config: Any = field(default_factory=dict)
+    trainer_config_path: str = ""


@dataclass
@@ -292,7 +293,7 @@ class SynchronizerConfig:
    sync_interval: int = 1
    # waiting for `sync_timeout` seconds before timeout in `nccl` method
    sync_timeout: int = 1200
-    # wait for the lastest checkpoint to be ready
+    # wait for the lastest checkpoint to be ready # TODO: to be used
    wait_for_checkpoint: bool = False
    # ! DO NOT SET, automatically calculated
@@ -338,7 +339,7 @@ def _check_interval(self) -> None:
            and self.algorithm.algorithm_type != AlgorithmType.DPO
            and self.explorer.eval_interval % self.synchronizer.sync_interval != 0
        ):
-            self.buffer.eval_interval = (
+            self.explorer.eval_interval = (
                max(self.explorer.eval_interval // self.synchronizer.sync_interval, 1)
            ) * self.synchronizer.sync_interval
            logger.warning(
diff --git a/trinity/common/verl_config.py b/trinity/common/verl_config.py
index dd896a23f1..e5d0d9d55f 100644
--- a/trinity/common/verl_config.py
+++ b/trinity/common/verl_config.py
@@ -1,3 +1,4 @@
+import math
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional
@@ -13,20 +14,7 @@
@dataclass
class Data:
-    tokenizer: Optional[str] = None
-    train_files: str = ""
-    val_files: str = ""
-    prompt_key: str = "prompt"
-    max_prompt_length: int = 512
-    max_response_length: int = 512
    train_batch_size: int = 1024
-    val_batch_size: Optional[int] = None
-    return_raw_input_ids: bool = False
-    return_raw_chat: bool = False
-    shuffle: bool = True
-    filter_overlong_prompts: bool = False
-    truncation: str = "error"
-    image_key: str = "images"


@dataclass
@@ -109,30 +97,7 @@ class Ref:
@dataclass
class Rollout:
-    name: str = "vllm"
    temperature: float = 1.0
-    top_k: int = -1
-    top_p: float = 1.0
-    use_fire_sampling: bool = False
-    prompt_length: int = 0
-    response_length: int = 0
-    dtype: str = "bfloat16"
-    gpu_memory_utilization: float = 0.5
-    ignore_eos: bool = False
-    enforce_eager: bool = True
-    free_cache_engine: bool = True
-    load_format: str = "dummy_dtensor"
-    tensor_model_parallel_size: int = 2
-    max_num_batched_tokens: int = 8192
-    max_model_len: Optional[int] = None
-    max_num_seqs: int = 1024
-    log_prob_micro_batch_size: Optional[int] = None
-    log_prob_micro_batch_size_per_gpu: int = 1
-    log_prob_use_dynamic_bsz: bool = False
-    log_prob_max_token_len_per_gpu: int = 0
-    disable_log_stats: bool = True
-    enable_chunked_prefill: bool = True
-    do_sample: bool = True
    n: int = 1 # > 1 for grpo
@@ -268,7 +233,7 @@ class veRLConfig:
    synchronizer: Optional[SynchronizerConfig] = None
    enable_preview: bool = True

-    def synchronize_config(self, config: Config) -> None:
+    def synchronize_config(self, config: Config) -> None: # noqa: C901
        """Synchronize config."""
        if config.mode != "train":
            rollout_gpu_num = (
@@ -283,36 +248,50 @@ def synchronize_config(self, config: Config) -> None:
            )
        else:
            rollout_gpu_num = 0
-        rollout_node_num = rollout_gpu_num // config.cluster.gpu_per_node
-        self.trainer.nnodes = config.cluster.node_num - rollout_node_num
-        self.actor_rollout_ref.model.path = config.model.model_path
-        self.critic.model.path = config.model.critic_model_path
-        self.critic.model.tokenizer_path = config.model.critic_model_path
        if config.cluster.node_num == 1:
            # for single node scenarios, rollout and training are on the same node
+            self.trainer.nnodes = config.cluster.node_num
            self.trainer.n_gpus_per_node = config.cluster.gpu_per_node - rollout_gpu_num
        else:
            # for multi-node scenarios, some nodes for rollout, others for training
+            assert (
+                rollout_gpu_num % config.cluster.gpu_per_node == 0
+            ), "rollout_gpu_num must be divisible by `gpu_per_node`"
+            rollout_node_num = math.ceil(rollout_gpu_num / config.cluster.gpu_per_node)
+            self.trainer.nnodes = config.cluster.node_num - rollout_node_num
+            if self.trainer.nnodes < 1:
+                raise ValueError("The number of training nodes must be greater than 0")
            self.trainer.n_gpus_per_node = config.cluster.gpu_per_node
-        self.trainer.sync_freq = config.synchronizer.sync_interval
-        self.trainer.save_freq = config.trainer.save_interval
-        self.synchronizer = config.synchronizer
-        self.actor_rollout_ref.synchronizer = config.synchronizer
-        self.buffer = config.buffer
+
        world_size = self.trainer.nnodes * self.trainer.n_gpus_per_node
        if config.buffer.batch_size % world_size != 0:
            raise ValueError(
                f"batch_size ({config.buffer.batch_size}) must be divisible by ({world_size})"
            )
-        # TODO: use dynamic read_batch_size to support multi-round scenarios
-        # Get the experiences of one explore step
+
+        self.trainer.sync_freq = config.synchronizer.sync_interval
+        self.trainer.save_freq = config.trainer.save_interval
        self.trainer.project_name = config.project
        self.trainer.experiment_name = config.name
-        self.data.train_batch_size = config.buffer.batch_size
        self.trainer.default_local_dir = config.checkpoint_job_dir
        self.trainer.sft_warmup_steps = config.buffer.trainer_input.sft_warmup_steps
-        self.actor_rollout_ref.actor.ppo_mini_batch_size = config.buffer.batch_size
+
+        self.buffer = config.buffer
+        # TODO: use dynamic read_batch_size to support multi-round scenarios
+        # Get the experiences of one explore step
+        self.data.train_batch_size = config.buffer.batch_size
+
+        self.synchronizer = config.synchronizer
+        self.actor_rollout_ref.synchronizer = config.synchronizer
+
+        # Actor / Critic config
+        self.actor_rollout_ref.model.path = config.model.model_path
+        self.critic.model.path = config.model.critic_model_path
+        self.critic.model.tokenizer_path = config.model.critic_model_path
+        self.actor_rollout_ref.actor.ppo_mini_batch_size = (
+            config.buffer.batch_size
+        ) # TODO: may allow user to change
        self.actor_rollout_ref.rollout.temperature = (
            config.buffer.explorer_input.taskset.rollout_args.temperature
        )
@@ -320,6 +299,22 @@ def synchronize_config(self, config: Config) -> None:
        self.critic.ppo_mini_batch_size = config.buffer.batch_size
        self.critic.rollout_n = self.actor_rollout_ref.rollout.n

+        if config.trainer.actor_use_kl_loss is not None:
+            self.actor_rollout_ref.actor.use_kl_loss = config.trainer.actor_use_kl_loss
+        if config.trainer.actor_kl_loss_coef is not None:
+            self.actor_rollout_ref.actor.kl_loss_coef = config.trainer.actor_kl_loss_coef
+        if config.trainer.actor_entropy_coef is not None:
+            self.actor_rollout_ref.actor.entropy_coeff = config.trainer.actor_entropy_coef
+        if config.trainer.actor_grad_clip is not None:
+            self.actor_rollout_ref.actor.grad_clip = config.trainer.actor_grad_clip
+        if config.trainer.actor_clip_ratio is not None:
+            self.actor_rollout_ref.actor.clip_ratio = config.trainer.actor_clip_ratio
+
+        # Algorithm related config
+        if config.algorithm.gamma is not None:
+            self.algorithm.gamma = config.algorithm.gamma
+        if config.algorithm.lam is not None:
+            self.algorithm.lam = config.algorithm.lam
        self.actor_rollout_ref.actor.algorithm_type = config.algorithm.algorithm_type
        if config.algorithm.algorithm_type == AlgorithmType.PPO:
            logger.info("Using GAE `adv_estimator` for PPO")
@@ -328,15 +323,6 @@ def synchronize_config(self, config: Config) -> None:
            logger.info("Using GRPO `adv_estimator` for GRPO")
            self.algorithm.adv_estimator = AdvantageEstimator.GRPO.value
-        # copy trainer related config from global config
-        self.algorithm.gamma = config.algorithm.gamma
-        self.algorithm.lam = config.algorithm.lam
-        self.actor_rollout_ref.actor.use_kl_loss = config.trainer.actor_use_kl_loss
-        self.actor_rollout_ref.actor.kl_loss_coef = config.trainer.actor_kl_loss_coef
-        self.actor_rollout_ref.actor.entropy_coeff = config.trainer.actor_entropy_coef
-        self.actor_rollout_ref.actor.grad_clip = config.trainer.actor_grad_clip
-        self.actor_rollout_ref.actor.clip_ratio = config.trainer.actor_clip_ratio
-
        if self.actor_rollout_ref.actor.algorithm_type.is_dpo():
            # for DPO
            if not self.actor_rollout_ref.actor.use_kl_loss:
                self.actor_rollout_ref.actor.use_kl_loss = True
diff --git a/trinity/common/workflows/workflow.py b/trinity/common/workflows/workflow.py
index 1a0daadb2b..9786bd6b77 100644
--- a/trinity/common/workflows/workflow.py
+++ b/trinity/common/workflows/workflow.py
@@ -153,12 +153,12 @@ def __init__(
        task: Task,
        auxiliary_models: Optional[List[openai.OpenAI]] = None,
    ):
+        self.reset(task)
        super().__init__(
            model=model,
            task=task,
            auxiliary_models=auxiliary_models,
        )
-        self.reset(task)

    @property
    def resettable(self):
@@ -226,14 +226,12 @@ def __init__(
        task: Task,
        auxiliary_models: Optional[List[openai.OpenAI]] = None,
    ):
-        if task.reward_fn is None:
-            task.reward_fn = MathRewardFn
-        if task.reward_fn == MathRewardFn and task.format_args.system_prompt is None:
-            task.format_args.system_prompt = """A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e.,
-<think> reasoning process here </think>
-<answer> answer here </answer>.
-"""
-        super().__init__(model=model, task=task, auxiliary_models=auxiliary_models)
+        self.reset(task)
+        super().__init__(
+            model=model,
+            task=task,
+            auxiliary_models=auxiliary_models,
+        )

    def reset(self, task: Task):
        if task.reward_fn is None:
diff --git a/trinity/manager/config_manager.py b/trinity/manager/config_manager.py
index 21b0e57348..9ac2d36f16 100644
--- a/trinity/manager/config_manager.py
+++ b/trinity/manager/config_manager.py
@@ -50,29 +50,41 @@ def _init_default_config(self):
            "mode": "both",
            "project": "Trinity-RFT",
            "exp_name": "qwen2.5-1.5B",
+            "checkpoint_root_dir": "",
            "monitor_type": MonitorType.TENSORBOARD.value,
+            # Algorithm Configs
+            "algorithm_type": AlgorithmType.PPO.value,
+            "_grouped_adv_repeat_times": 2,
+            "_not_grouped_adv_repeat_times": 1,
+            "repeat_times": 1,
+            "gamma": 1.0,
+            "lam": 1.0,
            # Model Configs
            "model_path": "",
            "critic_model_path": "",
-            "checkpoint_path": "",
+            "max_prompt_tokens": 1024,
+            "max_response_tokens": 1024,
+            # Cluster Config
            "node_num": 1,
            "gpu_per_node": 8,
            "total_gpu_num": 8,
            "trainer_gpu_num": 6,
-            "max_prompt_tokens": 1024,
-            "max_response_tokens": 1024,
-            # Global Configs
+            # Buffer Configs
            "total_epochs": 20,
            "_train_batch_size_per_gpu": 16,
            "train_batch_size": 96,
-            "eval_interval": 1000,
-            "algorithm_type": AlgorithmType.PPO.value,
+            "buffer_max_retry_times": 3,
+            "max_retry_interval": 1,
            # Taskset Configs
            "taskset_path": "",
            "taskset_subset_name": None,
            "taskset_split": "train",
            "taskset_prompt_key": "question",
            "taskset_response_key": "answer",
+            "temperature": 1.0,
+            "top_p": 1.0, # TODO: to be used
+            "top_k": -1, # TODO: to be used
+            "logprobs": 0,
            # Eval Taskset Configs
            "_eval_tasksets_num": 0,
            # Explorer Input Configs
@@ -80,15 +92,13 @@ def _init_default_config(self):
            "default_reward_fn_type": "math_reward",
            "system_prompt": None,
            "reply_prefix": None,
-            # Experience Buffer Configs
+            # Experience Buffer / DPO Dataset Configs
            "_dpo_storage_type": StorageType.FILE.value,
            "_not_dpo_storage_type": StorageType.QUEUE.value,
            "storage_type": StorageType.QUEUE.value,
            "_dpo_experience_buffer_path": "",
"_not_dpo_experience_buffer_path": "", "experience_buffer_path": "", - "buffer_max_retry_times": 3, - "max_retry_interval": 1, "dpo_dataset_train_split": "train", "dpo_dataset_prompt_type": PromptType.MESSAGES.value, "dpo_dataset_prompt_key": "prompt", @@ -101,26 +111,32 @@ def _init_default_config(self): "sft_warmup_messages_key": "messages", "sft_warmup_prompt_key": "prompt", "sft_warmup_response_key": "response", + # TrainerInput Configs + # TODO: read_experience_strategy + "sft_warmup_steps": 0, # Explorer and Sync Configs + "runner_num": 32, + "max_timeout": 900, + "explorer_max_retry_times": 2, + "eval_interval": 1000, + "eval_on_latest_checkpoint": True, + # Rollout Model Configs "engine_type": "vllm_async", "engine_num": 2, - "runner_num": 32, - "_grouped_adv_repeat_times": 2, - "_not_grouped_adv_repeat_times": 1, - "repeat_times": 1, "tensor_parallel_size": 1, - "enable_prefix_caching": False, + "use_v1": True, "enforce_eager": True, + "enable_prefix_caching": False, + "enable_chunked_prefill": False, + "gpu_memory_utilization": 0.9, "dtype": "bfloat16", - "temperature": 1.0, - "top_p": 1.0, - "top_k": -1, "seed": 42, - "logprobs": 0, - "gpu_memory_utilization": 0.9, - "enable_chunked_prefill": False, - "max_timeout": 900, - "explorer_max_retry_times": 2, + # TODO: max_prompt_tokens + # TODO: max_response_tokens + # TODO: chat_template + "enable_thinking": False, + "enable_openai_api": False, + # TODO: Auxiliary Models Configs # Synchronizer Configs "_not_dpo_sync_method": SyncMethod.NCCL.value, "sync_method": SyncMethod.NCCL.value, @@ -128,9 +144,15 @@ def _init_default_config(self): "sync_timeout": 1200, # Trainer Configs "trainer_type": "verl", - "sft_warmup_steps": 0, "_nccl_save_interval": 100, "save_interval": 100, + # TODO: enable_preview + "_not_dpo_actor_use_kl_loss": True, + "actor_use_kl_loss": True, + "actor_kl_loss_coef": 0.001, + "actor_entropy_coef": 0.001, + "actor_grad_clip": 1.0, + "actor_clip_ratio": 0.2, # veRL Trainer Configs "training_args": [ "balance_batch", @@ -151,8 +173,6 @@ def _init_default_config(self): "del_local_ckpt_after_load": False, "max_actor_ckpt_to_keep": None, "max_critic_ckpt_to_keep": None, - "gamma": 1.0, - "lam": 1.0, "adv_estimator": "gae", "norm_adv_by_std_in_grpo": True, "use_kl_in_reward": False, @@ -170,12 +190,6 @@ def _init_default_config(self): "actor_tau": 0.0, "actor_opmd_baseline": "mean", "actor_use_uid": False, - "actor_grad_clip": 1.0, - "actor_clip_ratio": 0.2, - "actor_entropy_coef": 0.001, - "_not_dpo_actor_use_kl_loss": True, - "actor_use_kl_loss": True, - "actor_kl_loss_coef": 0.001, "actor_kl_loss_type": "low_var_kl", "actor_checkpoint": ["model", "hf_model", "optimizer", "extra"], "critic_lr": 1e-6, @@ -204,7 +218,7 @@ def maintain_session_state(self): def _set_project(self): st.text_input("Project", key="project") - def _set_name(self): + def _set_exp_name(self): st.text_input("Experiment Name", key="exp_name") def _set_monitor_type(self): @@ -221,18 +235,19 @@ def _set_model_path(self): st.warning("Please input model path.") def _set_critic_model_path(self): - st.text_input( - "Critic Model Path (defaults to `model_path`)", - key="critic_model_path", - ) + if st.session_state["adv_estimator"] == AdvantageEstimator.GAE.value: + st.text_input( + "Critic Model Path (defaults to `model_path`)", + key="critic_model_path", + ) - def _set_checkpoint_path(self): - st.text_input("Checkpoint Path", key="checkpoint_path") - if not st.session_state["checkpoint_path"].strip(): # TODO: may auto generate - 
self.unfinished_fields.add("checkpoint_path") - st.warning("Please input checkpoint path.") - elif not os.path.isabs(st.session_state["checkpoint_path"].strip()): - self.unfinished_fields.add("checkpoint_path") + def _set_checkpoint_root_dir(self): + st.text_input("Checkpoint Root Dir", key="checkpoint_root_dir") + if not st.session_state["checkpoint_root_dir"].strip(): # TODO: may auto generate + self.unfinished_fields.add("checkpoint_root_dir") + st.warning("Please input checkpoint root dir.") + elif not os.path.isabs(st.session_state["checkpoint_root_dir"].strip()): + self.unfinished_fields.add("checkpoint_root_dir") st.warning("Please input an absolute path.") def _set_node_num(self): @@ -346,8 +361,9 @@ def _set_taskset_args(self): response_key_col.text_input( "Response Key :orange-badge[(Needs review)]", key="taskset_response_key" ) + self._set_configs_with_st_columns(["temperature", "logprobs"]) - def _set_eval_taskset_idx(self, idx): + def _set_eval_taskset_idx(self, idx): # TODO: add delete st.text_input( "Taskset Name", key=f"eval_taskset_{idx}_name", @@ -457,7 +473,7 @@ def _set_experience_buffer_path(self): # TODO if `storage_type == StorageType.QUEUE`, default to `None`, -if `storage_type == StorageType.SQL`, default to `sqlite:///{os.path.join(checkpoint_path, '.cache', project_name, experiment_name)}/data.db`.""" +if `storage_type == StorageType.SQL`, default to `sqlite:///{os.path.join(checkpoint_root_dir, '.cache', project_name, experiment_name)}/data.db`.""" def on_change(): if st.session_state["algorithm_type"] == AlgorithmType.DPO.value: @@ -545,7 +561,9 @@ def _set_sft_warmup_dataset_args(self): sft_warmup_messages_key_col, sft_warmup_prompt_key_col, sft_warmup_response_key_col, - ) = st.columns(3) + ) = st.columns( + 3 + ) # TODO: select by prompt type sft_warmup_messages_key_col.text_input( "SFT Dataset Messages Key :orange-badge[(Needs review)]", key="sft_warmup_messages_key", @@ -572,7 +590,7 @@ def _str_for_engine_num_and_tp_size(self): ```""" def _set_engine_num(self): - total_gpu_num = st.session_state["gpu_per_node"] * st.session_state["node_num"] + total_gpu_num = st.session_state["total_gpu_num"] max_engine_num = (total_gpu_num - 1) // st.session_state["tensor_parallel_size"] if st.session_state["engine_num"] > max_engine_num: st.session_state["engine_num"] = max_engine_num @@ -588,7 +606,7 @@ def _set_engine_num(self): ) def _set_tensor_parallel_size(self): - total_gpu_num = st.session_state["gpu_per_node"] * st.session_state["node_num"] + total_gpu_num = st.session_state["total_gpu_num"] max_tensor_parallel_size = (total_gpu_num - 1) // st.session_state["engine_num"] if st.session_state["tensor_parallel_size"] > max_tensor_parallel_size: st.session_state["tensor_parallel_size"] = max_tensor_parallel_size @@ -620,6 +638,33 @@ def _check_engine_num_and_tp_size(self): "Please ensure that `engine_num * tensor_parallel_size` can be divided by `gpu_per_node` when `node_num > 1`." 
            )

+    def _set_repeat_times(self): # TODO
+        grouped_adv_algorithms = [
+            AlgorithmType.GRPO.value,
+            AlgorithmType.OPMD.value, # TODO: may add rloo
+        ]
+        if st.session_state["algorithm_type"] in grouped_adv_algorithms:
+            min_repeat_times = 2
+            st.session_state["repeat_times"] = st.session_state["_grouped_adv_repeat_times"]
+        else:
+            min_repeat_times = 1
+            st.session_state["repeat_times"] = st.session_state["_not_grouped_adv_repeat_times"]
+
+        def on_change():
+            if st.session_state["algorithm_type"] in grouped_adv_algorithms:
+                st.session_state["_grouped_adv_repeat_times"] = st.session_state["repeat_times"]
+            else:
+                st.session_state["_not_grouped_adv_repeat_times"] = st.session_state["repeat_times"]
+
+        st.number_input(
+            "Repeat Times",
+            key="repeat_times",
+            min_value=min_repeat_times,
+            help="`repeat_times` is used to set how many experiences each task can generate, "
+            "and it must be greater than `1` when `algorithm_type` is `opmd` or `grpo`.",
+            on_change=on_change,
+        )
+
    def _set_sync_method(self):
        if st.session_state["algorithm_type"] == AlgorithmType.DPO.value:
            st.session_state["sync_method"] = SyncMethod.CHECKPOINT.value
@@ -686,6 +731,9 @@ def _set_seed(self):
    def _set_logprobs(self):
        st.number_input("Logprobs", key="logprobs", min_value=0, max_value=20)

+    def _set_use_v1(self):
+        st.checkbox("Use V1 Engine", key="use_v1")
+
    def _set_enable_prefix_caching(self):
        st.checkbox("Prefix Caching", key="enable_prefix_caching")
@@ -700,6 +748,12 @@ def _set_gpu_memory_utilization(self):
    def _set_enable_chunked_prefill(self):
        st.checkbox("Chunked Prefill", key="enable_chunked_prefill")

+    def _set_enable_thinking(self):
+        st.checkbox("Enable Thinking For Qwen3", key="enable_thinking")
+
+    def _set_enable_openai_api(self):
+        st.checkbox("Enable OpenAI API", key="enable_openai_api")
+
    def _set_max_timeout(self):
        st.number_input("Max Timeout", key="max_timeout", min_value=0)
@@ -745,6 +799,9 @@ def _set_sft_warmup_steps(self):
    def _set_eval_interval(self):
        st.number_input("Eval Interval", key="eval_interval", min_value=1)

+    def _set_eval_on_latest_checkpoint(self):
+        st.checkbox("Eval on Latest Checkpoint", key="eval_on_latest_ckp")
+
    def _set_training_args(self):
        st.multiselect(
            "Training Args",
@@ -787,33 +844,6 @@ def on_change():
    def _set_ppo_epochs(self):
        st.number_input("PPO Epochs", key="ppo_epochs", min_value=1)

-    def _set_repeat_times(self): # TODO
-        grouped_adv_algorithms = [
-            AlgorithmType.GRPO.value,
-            AlgorithmType.OPMD.value, # TODO: may add rloo
-        ]
-        if st.session_state["algorithm_type"] in grouped_adv_algorithms:
-            min_repeat_times = 2
-            st.session_state["repeat_times"] = st.session_state["_grouped_adv_repeat_times"]
-        else:
-            min_repeat_times = 1
-            st.session_state["repeat_times"] = st.session_state["_not_grouped_adv_repeat_times"]
-
-        def on_change():
-            if st.session_state["algorithm_type"] in grouped_adv_algorithms:
-                st.session_state["_grouped_adv_repeat_times"] = st.session_state["repeat_times"]
-            else:
-                st.session_state["_not_grouped_adv_repeat_times"] = st.session_state["repeat_times"]
-
-        st.number_input(
-            "Repeat Times",
-            key="repeat_times",
-            min_value=min_repeat_times,
-            help="`repeat_times` is used to set how many experiences each task can generate, "
-            "and it must be greater than `1` when `algorithm_type` is `opmd` or `grpo`.",
-            on_change=on_change,
-        )
-
    def _set_training_strategy(self):
        st.selectbox(
            "Training Strategy",
@@ -1105,11 +1135,11 @@ def _set_configs_with_st_columns(
    def beginner_mode(self):
        st.header("Essential Configs")

-        self._set_configs_with_st_columns(["project", "name"], columns_config=[1, 3])
+        self._set_configs_with_st_columns(["project", "exp_name"], columns_config=[1, 3])

        self._set_model_path()

-        self._set_checkpoint_path()
+        self._set_checkpoint_root_dir()

        self._set_taskset_path()
@@ -1169,12 +1199,12 @@ def beginner_mode(self):
        self._set_configs_with_st_columns(["critic_ppo_micro_batch_size_per_gpu", "critic_lr"])

    def _expert_model_part(self):
-        self._set_configs_with_st_columns(["project", "name"], columns_config=[1, 3])
+        self._set_configs_with_st_columns(["project", "exp_name"], columns_config=[1, 3])

        self._set_model_path()
        self._set_critic_model_path()

-        self._set_checkpoint_path()
+        self._set_checkpoint_root_dir()

        self._set_configs_with_st_columns(["monitor_type", "node_num", "gpu_per_node"])
        self._set_configs_with_st_columns(["max_prompt_tokens", "max_response_tokens"])
@@ -1213,34 +1243,36 @@ def _expert_buffer_part(self):
        self._set_configs_with_st_columns(["buffer_max_retry_times", "max_retry_interval"])

    def _expert_explorer_part(self):
+        self._set_configs_with_st_columns(["sync_method", "sync_interval", "sync_timeout"])
+
        self._set_configs_with_st_columns(
-            ["engine_type", "engine_num", "tensor_parallel_size", "repeat_times"]
+            [
+                "runner_num",
+                "max_timeout",
+                "explorer_max_retry_times",
+            ]
        )
-        self._check_engine_num_and_tp_size()
-        self._set_configs_with_st_columns(["sync_method", "sync_interval", "sync_timeout"])
+        self._set_configs_with_st_columns(["eval_interval", "eval_on_latest_checkpoint"])

-        with st.expander("Advanced Config"):
-            self._set_configs_with_st_columns(
-                ["runner_num", "temperature", "top_p", "top_k", "seed", "logprobs"]
-            )
+        with st.expander("Rollout Model Config", expanded=True):
+            self._set_configs_with_st_columns(["engine_type", "engine_num", "tensor_parallel_size"])
+            self._check_engine_num_and_tp_size()

-            self._set_configs_with_st_columns(["dtype", "gpu_memory_utilization"])
-            self._set_configs_with_st_columns(
-                [
-                    "max_timeout",
-                    "explorer_max_retry_times",
-                ]
-            )
+            self._set_configs_with_st_columns(["gpu_memory_utilization", "dtype", "seed"])

            self._set_configs_with_st_columns(
-                ["enable_prefix_caching", "enforce_eager", "enable_chunked_prefill"]
+                ["use_v1", "enforce_eager", "enable_prefix_caching", "enable_chunked_prefill"]
            )
+            self._set_configs_with_st_columns(["enable_thinking", "enable_openai_api"])
+
+        with st.expander("Auxiliary Models", expanded=True): # TODO
+            pass
+
    def _expert_trainer_part(self):
-        self._set_configs_with_st_columns( # TODO: may add `trainer_type`
-            ["algorithm_type", "sft_warmup_steps", "eval_interval", "save_interval"]
-        )
+        self._set_configs_with_st_columns(["algorithm_type", "gamma", "lam"])
+        self._set_configs_with_st_columns(["repeat_times", "save_interval"])

        self._check_sft_warmup_dataset_path()
        if st.session_state["trainer_type"] == "verl":
@@ -1280,7 +1312,6 @@ def _expert_verl_trainer_part(self):
        with rl_algorithm_tab:
            st.subheader("RL Algorithm Config")

-            self._set_configs_with_st_columns(["gamma", "lam"])
            self._set_configs_with_st_columns(["norm_adv_by_std_in_grpo", "use_kl_in_reward"])
            self._set_configs_with_st_columns(["kl_penalty", "kl_ctrl_type", "kl_ctrl_coef"])
            self._set_configs_with_st_columns(["horizon", "target_kl"])
@@ -1341,7 +1372,7 @@ def expert_mode(self):
            with tab:
                func()

-    def _generate_verl_config(self, trainer_nnodes: int = 1, trainer_n_gpus_per_node: int = 8):
+    def _generate_verl_config(self):
        balance_batch = "balance_batch" in st.session_state["training_args"]
        enable_gradient_checkpointing = (
            "gradient_checkpointing" in st.session_state["training_args"]
@@ -1363,33 +1394,10 @@ def _generate_verl_config(self, trainer_nnodes: int = 1, trainer_n_gpus_per_node
            st.session_state["max_prompt_tokens"] + st.session_state["max_response_tokens"]
        )
-        critic_model_path = (
-            st.session_state["critic_model_path"].strip()
-            if st.session_state["critic_model_path"].strip()
-            else st.session_state["model_path"]
-        )
        trainer_config = {
-            "data": {
-                "tokenizer": None,
-                "train_files": "placeholder",
-                "val_files": "placeholder",
-                "prompt_key": "placeholder",
-                "max_prompt_length": st.session_state["max_prompt_tokens"],
-                "max_response_length": st.session_state["max_response_tokens"],
-                "train_batch_size": st.session_state["train_batch_size"]
-                * st.session_state["repeat_times"],
-                "val_batch_size": None,
-                "return_raw_input_ids": False,
-                "return_raw_chat": False,
-                "shuffle": True,
-                "filter_overlong_prompts": False,
-                "truncation": "error",
-                "image_key": "images",
-            },
            "actor_rollout_ref": {
                "hybrid_engine": True,
                "model": {
-                    "path": st.session_state["model_path"],
                    "external_lib": None,
                    "override_config": {},
                    "enable_gradient_checkpointing": enable_gradient_checkpointing,
@@ -1403,11 +1411,6 @@ def _generate_verl_config(self, trainer_nnodes: int = 1, trainer_n_gpus_per_node
                    ],
                    "use_dynamic_bsz": use_dynamic_bsz,
                    "ppo_max_token_len_per_gpu": ppo_max_token_len_per_gpu,
-                    "grad_clip": st.session_state["actor_grad_clip"],
-                    "clip_ratio": st.session_state["actor_clip_ratio"],
-                    "entropy_coeff": st.session_state["actor_entropy_coef"],
-                    "use_kl_loss": st.session_state["actor_use_kl_loss"],
-                    "kl_loss_coef": st.session_state["actor_kl_loss_coef"],
                    "kl_loss_type": st.session_state["actor_kl_loss_type"],
                    "ppo_epochs": st.session_state["ppo_epochs"],
                    "shuffle": False,
@@ -1441,34 +1444,32 @@ def _generate_verl_config(self, trainer_nnodes: int = 1, trainer_n_gpus_per_node
                        "actor_ulysses_sequence_parallel_size"
                    ],
                },
-                "rollout": {
-                    "name": "vllm",
-                    "temperature": st.session_state["temperature"],
-                    "top_k": -1,
-                    "top_p": 1,
-                    "use_fire_sampling": False,
-                    "prompt_length": st.session_state["max_prompt_tokens"],
-                    "response_length": st.session_state["max_response_tokens"],
-                    "dtype": "bfloat16",
-                    "gpu_memory_utilization": 0.4,
-                    "ignore_eos": False,
-                    "enforce_eager": True,
-                    "free_cache_engine": True,
-                    "load_format": "dummy_dtensor",
-                    "tensor_model_parallel_size": 2,
-                    "max_num_batched_tokens": 8192,
-                    "max_model_len": None,
-                    "max_num_seqs": 1024,
-                    "log_prob_micro_batch_size_per_gpu": 4,
-                    "log_prob_use_dynamic_bsz": use_dynamic_bsz,
-                    "log_prob_max_token_len_per_gpu": ppo_max_token_len_per_gpu,
-                    "disable_log_stats": True,
-                    "enable_chunked_prefill": True,
-                    "do_sample": True,
-                    "n": st.session_state["repeat_times"],
+            },
+            "custom_reward_function": {"path": None, "name": "compute_score"},
+            "algorithm": {
+                "kl_penalty": st.session_state["kl_penalty"],
+                "kl_ctrl": {
+                    "type": st.session_state["kl_ctrl_type"],
+                    "kl_coef": st.session_state["kl_ctrl_coef"],
                },
            },
-            "critic": {
+            "trainer": {
+                "balance_batch": balance_batch,
+                "logger": ["tensorboard"],
+                "resume_mode": st.session_state["resume_mode"],
+                "resume_from_path": st.session_state["resume_from_path"],
+                "default_hdfs_dir": st.session_state["default_hdfs_dir"],
+                "remove_previous_ckpt_in_save": st.session_state["remove_previous_ckpt_in_save"],
+                "del_local_ckpt_after_load": st.session_state["del_local_ckpt_after_load"],
+                "val_before_train": False,
+                "max_actor_ckpt_to_keep": st.session_state["max_actor_ckpt_to_keep"],
+                "max_critic_ckpt_to_keep": st.session_state["max_critic_ckpt_to_keep"],
+            },
+        }
+
+        if st.session_state["adv_estimator"] == AdvantageEstimator.GAE.value:
+            trainer_config["trainer"]["critic_warmup"] = st.session_state["critic_warmup"]
+            trainer_config["critic"] = {
                "strategy": st.session_state["training_strategy"],
                "optim": {
                    "lr": st.session_state["critic_lr"],
@@ -1481,8 +1482,6 @@ def _generate_verl_config(self, trainer_nnodes: int = 1, trainer_n_gpus_per_node
                    ),
                },
                "model": {
-                    "path": critic_model_path,
-                    "tokenizer_path": critic_model_path,
                    "override_config": {},
                    "external_lib": None,
                    "enable_gradient_checkpointing": enable_gradient_checkpointing,
@@ -1507,98 +1506,129 @@ def _generate_verl_config(self, trainer_nnodes: int = 1, trainer_n_gpus_per_node
                "grad_clip": st.session_state["critic_grad_clip"],
                "cliprange_value": st.session_state["critic_cliprange_value"],
                "checkpoint": {"contents": st.session_state["critic_checkpoint"]},
-            },
-            "reward_model": {
-                "enable": False,
-                "strategy": "fsdp",
-                "model": {
-                    "input_tokenizer": st.session_state["model_path"],
-                    "path": "~/models/FsfairX-LLaMA3-RM-v0.1",
-                    "external_lib": None,
-                    "use_remove_padding": False,
-                    "fsdp_config": {
-                        "min_num_params": 0,
-                        "param_offload": False,
-                        "fsdp_size": -1,
-                    },
-                },
-                "ulysses_sequence_parallel_size": 1,
-                "use_dynamic_bsz": use_dynamic_bsz,
-                "forward_max_token_len_per_gpu": ppo_max_token_len_per_gpu * 2,
-                "reward_manager": "naive",
-            },
-            "custom_reward_function": {"path": None, "name": "compute_score"},
-            "algorithm": {
-                "gamma": st.session_state["gamma"],
-                "lam": st.session_state["lam"],
-                "adv_estimator": st.session_state["adv_estimator"],
-                "kl_penalty": st.session_state["kl_penalty"],
-                "kl_ctrl": {
-                    "type": st.session_state["kl_ctrl_type"],
-                    "kl_coef": st.session_state["kl_ctrl_coef"],
-                },
-            },
-            "trainer": {
-                "balance_batch": balance_batch,
-                "total_epochs": st.session_state["total_epochs"],
-                "project_name": st.session_state["project"],
-                "experiment_name": st.session_state["exp_name"],
-                "logger": ["tensorboard"],
-                "val_generations_to_log_to_wandb": 0,
-                "nnodes": trainer_nnodes,
-                "n_gpus_per_node": trainer_n_gpus_per_node,
-                "save_freq": st.session_state["save_interval"],
-                "resume_mode": st.session_state["resume_mode"],
-                "resume_from_path": st.session_state["resume_from_path"],
-                "test_freq": 100,
-                "critic_warmup": st.session_state["critic_warmup"],
-                "default_hdfs_dir": st.session_state["default_hdfs_dir"],
-                "remove_previous_ckpt_in_save": st.session_state["remove_previous_ckpt_in_save"],
-                "del_local_ckpt_after_load": st.session_state["del_local_ckpt_after_load"],
-                "default_local_dir": st.session_state["checkpoint_path"],
-                "val_before_train": False,
-                "sync_freq": st.session_state["sync_interval"],
-                "max_actor_ckpt_to_keep": st.session_state["max_actor_ckpt_to_keep"],
-                "max_critic_ckpt_to_keep": st.session_state["max_critic_ckpt_to_keep"],
-            },
-        }
+            }

        return trainer_config

-    def generate_config(self):
-        if st.session_state["mode"] == "both":
-            trainer_nnodes = (
-                st.session_state["node_num"]
-                - st.session_state["engine_num"]
-                * st.session_state["tensor_parallel_size"]
-                // st.session_state["gpu_per_node"]
-            )
-        else:
-            trainer_nnodes = st.session_state["node_num"]
-        if st.session_state["node_num"] == 1 and st.session_state["mode"] == "both":
-            trainer_n_gpus_per_node = (
-                st.session_state["gpu_per_node"]
-                - st.session_state["engine_num"] * st.session_state["tensor_parallel_size"]
-            )
-        else:
-            trainer_n_gpus_per_node = st.session_state["gpu_per_node"]
-
+    def _gen_buffer_config(self):
        if st.session_state["algorithm_type"] != AlgorithmType.DPO.value:
            experience_buffer_path = st.session_state["experience_buffer_path"].strip()
            if (
                not experience_buffer_path
                and st.session_state["storage_type"] == StorageType.SQL.value
            ):
-                experience_buffer_path = f"sqlite:///{os.path.join(st.session_state['checkpoint_path'], '.cache', st.session_state['project'], st.session_state['exp_name'])}/data.db"
+                experience_buffer_path = f"sqlite:///{os.path.join(st.session_state['checkpoint_root_dir'], '.cache', st.session_state['project'], st.session_state['exp_name'])}/data.db"
        sft_storage_type = (
            StorageType.SQL.value
            if "://" in st.session_state["sft_warmup_dataset_path"]
            else StorageType.FILE.value
        ) # TODO
+
+        buffer_config = {
+            "batch_size": st.session_state["train_batch_size"],
+            "total_epochs": st.session_state["total_epochs"],
+            "explorer_input": {
+                "taskset": {
+                    "name": "taskset",
+                    "storage_type": StorageType.FILE.value,
+                    "path": st.session_state["taskset_path"],
+                    "split": st.session_state["taskset_split"],
+                    "subset_name": st.session_state["taskset_subset_name"],
+                    "format": {
+                        "prompt_key": st.session_state["taskset_prompt_key"],
+                        "response_key": st.session_state["taskset_response_key"],
+                    },
+                    "rollout_args": {
+                        "temperature": st.session_state["temperature"],
+                        "logprobs": st.session_state["logprobs"],
+                    },
+                },
+                "eval_tasksets": [],
+                "default_workflow_type": st.session_state["default_workflow_type"],
+                "default_reward_fn_type": st.session_state["default_reward_fn_type"],
+                "system_prompt": st.session_state["system_prompt"],
+                "reply_prefix": st.session_state["reply_prefix"],
+            },
+            "trainer_input": {
+                "experience_buffer": {
+                    "name": "experience_buffer",
+                    "storage_type": st.session_state["storage_type"],
+                    "path": experience_buffer_path,
+                },
+                "sft_warmup_steps": st.session_state["sft_warmup_steps"],
+            },
+            "max_retry_times": st.session_state["buffer_max_retry_times"],
+            "max_retry_interval": st.session_state["max_retry_interval"],
+        }
+
+        for idx in range(st.session_state["_eval_tasksets_num"]):
+            if st.session_state[f"eval_taskset_{idx}_path"].strip():
+                buffer_config["explorer_input"]["eval_tasksets"].append(
+                    {
+                        "name": st.session_state[f"eval_taskset_{idx}_name"],
+                        "path": st.session_state[f"eval_taskset_{idx}_path"],
+                        "subset_name": st.session_state[f"eval_taskset_{idx}_subset_name"],
+                        "split": st.session_state[f"eval_taskset_{idx}_split"],
+                        "prompt_key": st.session_state[f"eval_taskset_{idx}_prompt_key"],
+                        "response_key": st.session_state[f"eval_taskset_{idx}_response_key"],
+                    }
+                )
+        if st.session_state["algorithm_type"] == AlgorithmType.DPO.value:
+            experience_buffer = buffer_config["trainer_input"]["experience_buffer"]
+            experience_buffer["split"] = st.session_state["dpo_dataset_train_split"]
+            experience_buffer["format"] = {
+                "prompt_type": st.session_state["dpo_dataset_prompt_type"],
+                "prompt_key": st.session_state["dpo_dataset_prompt_key"],
+                "chosen_key": st.session_state["dpo_dataset_chosen_key"],
+                "rejected_key": st.session_state["dpo_dataset_rejected_key"],
+            }
+        if st.session_state["sft_warmup_dataset_path"].strip():
+            buffer_config["trainer_input"]["sft_warmup_dataset"] = {
+                "name": "sft_warmup_dataset",
+                "storage_type": sft_storage_type,
+                "path": st.session_state["sft_warmup_dataset_path"],
+                "split": st.session_state["sft_warmup_train_split"],
+                "format": {
+                    "prompt_type": st.session_state["sft_warmup_prompt_type"],
+                    "messages_key": st.session_state["sft_warmup_messages_key"],
+                    "prompt_key": st.session_state["sft_warmup_prompt_key"],
+                    "response_key": st.session_state["sft_warmup_response_key"],
+                },
+            }
+
+        return buffer_config
+
+    def _gen_explorer_config(self):
+        explorer_config = {
+            "runner_num": st.session_state["runner_num"],
+            "max_timeout": st.session_state["max_timeout"],
+            "max_retry_times": st.session_state["explorer_max_retry_times"],
+            "rollout_model": {
+                "engine_type": st.session_state["engine_type"],
+                "engine_num": st.session_state["engine_num"],
+                "tensor_parallel_size": st.session_state["tensor_parallel_size"],
+                "use_v1": st.session_state["use_v1"],
+                "enforce_eager": st.session_state["enforce_eager"],
+                "enable_prefix_caching": st.session_state["enable_prefix_caching"],
+                "enable_chunked_prefill": st.session_state["enable_chunked_prefill"],
+                "gpu_memory_utilization": st.session_state["gpu_memory_utilization"],
+                "dtype": st.session_state["dtype"],
+                "seed": st.session_state["seed"],
+                # "max_prompt_tokens": None, # TODO
+                # "max_response_tokens": None, # TODO
+                # "chat_template": None, # TODO: add chat template
+                "enable_thinking": st.session_state["enable_thinking"],
+                "enable_openai_api": st.session_state["enable_openai_api"],
+            },
+            "auxiliary_models": [],
+            "eval_interval": st.session_state["eval_interval"],
+            "eval_on_latest_checkpoint": st.session_state["eval_on_latest_checkpoint"],
+        }
+        return explorer_config
+
+    def generate_config(self):
        if st.session_state["trainer_type"] == "verl":
-            trainer_config = self._generate_verl_config(
-                trainer_nnodes=trainer_nnodes, trainer_n_gpus_per_node=trainer_n_gpus_per_node
-            )
+            trainer_config = self._generate_verl_config()
        else:
            raise ValueError(f"Invalid trainer type: {st.session_state['trainer_type']}")
@@ -1623,12 +1653,15 @@ def generate_config(self):
        config = {
            "mode": st.session_state["mode"],
            "project": st.session_state["project"],
-            "name": st.session_state["name"],
-            "checkpoint_root_dir": st.session_state["checkpoint_path"],
+            "name": st.session_state["exp_name"],
+            "checkpoint_root_dir": st.session_state["checkpoint_root_dir"],
            "algorithm": {
                "algorithm_type": st.session_state["algorithm_type"],
                "repeat_times": st.session_state["repeat_times"],
+                "gamma": st.session_state["gamma"],
+                "lam": st.session_state["lam"],
            },
+            "data_processor": {}, # TODO: Add data processor config
            "model": {
                "model_path": st.session_state["model_path"],
                "max_prompt_tokens": st.session_state["max_prompt_tokens"],
@@ -1638,75 +1671,27 @@ def generate_config(self):
                "node_num": st.session_state["node_num"],
                "gpu_per_node": st.session_state["gpu_per_node"],
            },
-            "buffer": {
-                "total_epochs": st.session_state["total_epochs"],
-                "batch_size": st.session_state["train_batch_size"],
-                "max_retry_times": st.session_state["buffer_max_retry_times"],
-                "max_retry_interval": st.session_state["max_retry_interval"],
-                "explorer_input": {
-                    "taskset": {
-                        "name": "taskset",
-                        "storage_type": StorageType.FILE.value,
-                        "path": st.session_state["taskset_path"],
-                        "split": st.session_state["taskset_split"],
-                        "subset_name": st.session_state["taskset_subset_name"],
-                        "format": {
-                            "prompt_key": st.session_state["taskset_prompt_key"],
-                            "response_key": st.session_state["taskset_response_key"],
-                        },
-                        "rollout_args": {
-                            "n": st.session_state["repeat_times"],
-                            "temperature": st.session_state["temperature"],
-                            "top_p": st.session_state["top_p"],
-                            "top_k": st.session_state["top_k"],
-                            "logprobs": st.session_state["logprobs"],
-                        },
-                    },
-                    "eval_tasksets": [], # TODO: add eval tasksets
-                    "default_workflow_type": st.session_state["default_workflow_type"],
st.session_state["default_workflow_type"], - "default_reward_fn_type": st.session_state["default_reward_fn_type"], - "system_prompt": st.session_state["system_prompt"], - "reply_prefix": st.session_state["reply_prefix"], - }, - "trainer_input": { - "experience_buffer": { - "name": "experience_buffer", - "storage_type": st.session_state["storage_type"], - "path": experience_buffer_path, - }, - "sft_warmup_steps": st.session_state["sft_warmup_steps"], - }, - }, - "explorer": { - "eval_interval": st.session_state["eval_interval"], - "engine_type": st.session_state["engine_type"], - "engine_num": st.session_state["engine_num"], - "runner_num": st.session_state["runner_num"], - # "chat_template": None, # TODO: add chat template - "tensor_parallel_size": st.session_state["tensor_parallel_size"], - "enable_prefix_caching": st.session_state["enable_prefix_caching"], - "enforce_eager": st.session_state["enforce_eager"], - "dtype": st.session_state["dtype"], - "seed": st.session_state["seed"], - "gpu_memory_utilization": st.session_state["gpu_memory_utilization"], - "enable_chunked_prefill": st.session_state["enable_chunked_prefill"], - "use_v1": True, - "max_timeout": st.session_state["max_timeout"], - "max_retry_times": st.session_state["explorer_max_retry_times"], - }, - "synchronizer": { - "sync_method": st.session_state["sync_method"], - "sync_interval": st.session_state["sync_interval"], - "sync_timeout": st.session_state["sync_timeout"], - }, + "buffer": self._gen_buffer_config(), + "explorer": self._gen_explorer_config(), "trainer": { "trainer_type": st.session_state["trainer_type"], - "trainer_config": trainer_config, "save_interval": st.session_state["save_interval"], + "enable_preview": True, # TODO + "actor_use_kl_loss": st.session_state["actor_use_kl_loss"], + "actor_kl_loss_coef": st.session_state["actor_kl_loss_coef"], + "actor_entropy_coef": st.session_state["actor_entropy_coef"], + "actor_grad_clip": st.session_state["actor_grad_clip"], + "actor_clip_ratio": st.session_state["actor_clip_ratio"], + "trainer_config": trainer_config, }, "monitor": { "monitor_type": st.session_state["monitor_type"], }, + "synchronizer": { + "sync_method": st.session_state["sync_method"], + "sync_interval": st.session_state["sync_interval"], + "sync_timeout": st.session_state["sync_timeout"], + }, } if st.session_state["adv_estimator"] == AdvantageEstimator.GAE.value: @@ -1716,40 +1701,6 @@ def generate_config(self): else st.session_state["model_path"] ) - for idx in range(st.session_state["_eval_tasksets_num"]): - if st.session_state[f"eval_taskset_{idx}_path"].strip(): - config["buffer"]["explorer_input"]["eval_tasksets"].append( - { - "name": st.session_state[f"eval_taskset_{idx}_name"], - "path": st.session_state[f"eval_taskset_{idx}_path"], - "subset_name": st.session_state[f"eval_taskset_{idx}_subset_name"], - "split": st.session_state[f"eval_taskset_{idx}_split"], - "prompt_key": st.session_state[f"eval_taskset_{idx}_prompt_key"], - "response_key": st.session_state[f"eval_taskset_{idx}_response_key"], - } - ) - if st.session_state["algorithm_type"] == AlgorithmType.DPO.value: - experience_buffer = config["buffer"]["trainer_input"]["experience_buffer"] - experience_buffer["split"] = st.session_state["dpo_dataset_train_split"] - experience_buffer["format"] = { - "prompt_type": st.session_state["dpo_dataset_prompt_type"], - "prompt_key": st.session_state["dpo_dataset_prompt_key"], - "chosen_key": st.session_state["dpo_dataset_chosen_key"], - "rejected_key": 
st.session_state["dpo_dataset_rejected_key"], - } - if st.session_state["sft_warmup_dataset_path"].strip(): - config["buffer"]["trainer_input"]["sft_warmup_dataset"] = { - "name": "sft_warmup_dataset", - "storage_type": sft_storage_type, - "path": st.session_state["sft_warmup_dataset_path"], - "split": st.session_state["sft_warmup_train_split"], - "format": { - "prompt_type": st.session_state["sft_warmup_prompt_type"], - "messages_key": st.session_state["sft_warmup_messages_key"], - "prompt_key": st.session_state["sft_warmup_prompt_key"], - "response_key": st.session_state["sft_warmup_response_key"], - }, - } st.session_state.config_generated = True st.header("Generated Config File") buttons = st.container() @@ -1758,7 +1709,7 @@ def generate_config(self): save_btn.download_button( "Save", data=yaml_config, - file_name=f"{config['monitor']['project']}-{config['monitor']['name']}.yaml", + file_name=f"{config['project']}-{config['name']}.yaml", mime="text/plain", icon=":material/download:", use_container_width=True, diff --git a/trinity/trainer/verl_trainer.py b/trinity/trainer/verl_trainer.py index 090a5ff881..7590d6075b 100644 --- a/trinity/trainer/verl_trainer.py +++ b/trinity/trainer/verl_trainer.py @@ -416,17 +416,6 @@ def train_rft_step(self, experiences: Experiences) -> Tuple[bool, int]: actor_output_metrics = reduce_metrics(actor_output.meta_info["metrics"]) metrics.update(actor_output_metrics) - # validate - if ( - self.val_reward_fn is not None - and self.config.trainer.test_freq > 0 - and self.global_steps % self.config.trainer.test_freq == 0 - ): - pass # TODO: may add validation - # with _timer("testing", timing_raw): - # val_metrics: dict = self._validate() - # metrics.update(val_metrics) - if ( self.config.trainer.save_freq > 0 and self.global_steps % self.config.trainer.save_freq == 0