21 changes: 9 additions & 12 deletions docs/sphinx_doc/source/tutorial/example_multi_turn.md
@@ -21,7 +21,7 @@ You may refer to their original environment to complete the setup.
 ### Data Preparation
 Our dataset follows the format of the Hugging Face `datasets` library, so we need to convert our environment dataset accordingly.
 
-Just run the following command.
+Just check the data preparation scripts and run the following command.
 ```bash
 # For ALFworld env
 python scripts/data_prepare/get_alfworld_data.py
@@ -53,18 +53,16 @@ We provide an easy way to allow you to build your own environment pipeline by creat…
 
 See `trinity/common/workflows/envs/alfworld/alfworld_workflow.py` for an example of how to construct a multi-round workflow.
 
-You can interact with the environment using the messages format, and call the `self.process_batch_messages` function to transform the messages and rewards into the `experience` we need and send them to the buffer.
+You can interact with the environment using the messages format, and call the `self.process_messages_to_experience` function to transform the messages and rewards into the `experience` we need and send them to the buffer.
 
 ```python
-class AlfworldWorkflow(Workflow):
+class AlfworldWorkflow(MultiTurnWorkflow):
     """A workflow for alfworld task."""
     ...
 
     def generate_env_inference_samples(self, env, rollout_num) -> List[Experience]:
         print("Generating env inference samples...")
-        all_messages = []
-        all_rewards = []
-        all_infos = []
+        experience_list = []
         for i in range(rollout_num):
             observation, info = env.reset()
             final_reward = -0.1
@@ -80,14 +78,13 @@ class AlfworldWorkflow(Workflow):
                 if done:
                     final_reward = reward
                     break
-            all_infos.append(
-                {"env_rounds": r, "env_done": 1 if done else 0}
+            experience = self.process_messages_to_experience(
+                memory, final_reward, {"env_rounds": r, "env_done": 1 if done else 0}
             )
-            all_messages.append(memory)
-            all_rewards.append(final_reward)
+            experience_list.append(experience)
         # Close the env to save cpu memory
         env.close()
-        return self.process_batch_messages(all_messages, all_rewards, all_infos=all_infos)
+        return experience_list
 
 
     def run(self) -> List[Experience]:
@@ -102,7 +99,7 @@ class AlfworldWorkflow(Workflow):
 Also, remember to register your workflow:
 ```python
 @WORKFLOWS.register_module("alfworld_workflow")
-class AlfworldWorkflow(Workflow):
+class AlfworldWorkflow(MultiTurnWorkflow):
     """A workflow for alfworld task."""
     ...
 ```
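
For reference, the new `scripts/config/sciworld.yaml` below sets `default_workflow_type: 'sciworld_workflow'`, which would be registered the same way. The sketch below only illustrates that pattern by mirroring the alfworld example above; the import paths, the message format, the `self.model.chat(...)` call, and the `max_env_steps` attribute are assumptions for illustration, not code from this PR.

```python
# Illustrative sketch only: a hypothetical ScienceWorld workflow mirroring the
# alfworld example above. Import paths, the env/model APIs, and the attributes
# marked "assumed" are not taken from this PR.
from typing import List

from trinity.common.experience import Experience                     # assumed path
from trinity.common.workflows import WORKFLOWS, MultiTurnWorkflow    # assumed path


@WORKFLOWS.register_module("sciworld_workflow")
class SciworldWorkflow(MultiTurnWorkflow):
    """A workflow for the ScienceWorld task (sketch)."""

    def generate_env_inference_samples(self, env, rollout_num) -> List[Experience]:
        experience_list = []
        for _ in range(rollout_num):
            observation, info = env.reset()
            memory = [{"role": "user", "content": observation}]      # assumed message format
            final_reward = -0.1
            done = False
            for r in range(self.max_env_steps):                      # assumed attribute
                response = self.model.chat(memory)[0].response_text  # assumed model API
                memory.append({"role": "assistant", "content": response})
                observation, reward, done, info = env.step(response)  # assumed env API
                memory.append({"role": "user", "content": observation})
                if done:
                    final_reward = reward
                    break
            experience = self.process_messages_to_experience(
                memory, final_reward, {"env_rounds": r, "env_done": 1 if done else 0}
            )
            experience_list.append(experience)
        # Close the env to save memory, as in the alfworld example
        env.close()
        return experience_list
```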
56 changes: 56 additions & 0 deletions scripts/config/sciworld.yaml
@@ -0,0 +1,56 @@
data:
  total_epoch: 20
  batch_size: 4
  dataset_path: 'scripts/data_prepare/sciworld_data'
  default_workflow_type: 'sciworld_workflow'
  train_split: 'train'
  eval_split: ''
  format_config:
    prompt_key: 'game_file'
model:
  model_path: '/PATH/TO/MODEL/CHECKPOINT/'
  max_prompt_tokens: 4096
  max_response_tokens: 16384
  checkpoint_path: 'checkpoints/sciworld_RFT'
cluster:
  node_num: 1
  gpu_per_node: 8
buffer:
  max_retry_times: 3
  max_retry_interval: 1
  train_dataset:
    name: sciworld_buffer
    storage_type: queue
    algorithm_type: ppo
    path: 'sqlite:///sciworld.db'
explorer:
  engine_type: vllm_async
  engine_num: 2
  runner_num: 32
  tensor_parallel_size: 2
  enable_prefix_caching: false
  enforce_eager: true
  dtype: bfloat16
  temperature: 1.0
  top_p: 1.0
  top_k: -1
  seed: 42
  logprobs: 0
  repeat_times: 8
  use_ray: false
  backend: 'nccl'
  max_pending_requests: 32
  max_waiting_steps: 4
  gpu_memory_utilization: 0.7
  enable_chunked_prefil: true
synchronizer:
  sync_method: 'online'
  sync_iteration_interval: 8
trainer:
  trainer_type: 'verl'
  algorithm_type: ppo
  trainer_config_path: 'scripts/config/train_sciworld.yaml'
monitor:
  cache_root_dir: ""
  project: "sciworld"
  name: "sciworld_RFT"
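
As a quick sanity check on the nesting above, the snippet below simply loads the file with PyYAML and derives two numbers from it: the GPUs claimed by the rollout engines (`engine_num * tensor_parallel_size = 4` of the 8 per node) and the experiences produced per explore step (`batch_size * repeat_times = 32`). Reading the file as plain nested YAML is the only assumption here.

```python
# Minimal sketch: load scripts/config/sciworld.yaml and check a few derived values.
# Assumes only that the file is plain YAML with the nesting shown above.
import yaml

with open("scripts/config/sciworld.yaml") as f:
    cfg = yaml.safe_load(f)

# 2 engines x TP 2 = 4 GPUs for rollout, within the 8 available per node
rollout_gpus = cfg["explorer"]["engine_num"] * cfg["explorer"]["tensor_parallel_size"]
assert rollout_gpus <= cfg["cluster"]["gpu_per_node"]

# 4 tasks per batch, each rolled out 8 times -> 32 experiences per explore step
per_step = cfg["data"]["batch_size"] * cfg["explorer"]["repeat_times"]
print(f"rollout GPUs: {rollout_gpus}, experiences per explore step: {per_step}")
print("trainer config:", cfg["trainer"]["trainer_config_path"])
```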
183 changes: 183 additions & 0 deletions scripts/config/train_sciworld.yaml
@@ -0,0 +1,183 @@
data:
  tokenizer: null
  train_files: train_example.parquet
  val_files: test_example.parquet
  prompt_key: prompt
  max_prompt_length: 4096
  max_response_length: 16384
  train_batch_size: 96
  val_batch_size: null
  return_raw_input_ids: False # This should be set to true when the tokenizer between policy and rm differs
  return_raw_chat: False
  shuffle: True
  filter_overlong_prompts: False # for large-scale dataset, filtering overlong prompts could be timeconsuming. You should disable this and set `truncation='left'`
  truncation: error
  image_key: images

actor_rollout_ref:
  hybrid_engine: True
  model:
    path: /PATH/TO/MODEL/CHECKPOINT/
    external_lib: null
    override_config: { }
    enable_gradient_checkpointing: True
    use_remove_padding: False
  actor:
    strategy: fsdp # This is for backward-compatibility
    ppo_mini_batch_size: 1536
    # ppo_micro_batch_size: 8 # will be deprecated, use ppo_micro_batch_size_per_gpu
    ppo_micro_batch_size_per_gpu: 1
    use_dynamic_bsz: False
    ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
    grad_clip: 1.0
    clip_ratio: 0.2
    entropy_coeff: 0.001
    use_kl_loss: True # True for GRPO
    kl_loss_coef: 0.001 # for grpo
    kl_loss_type: low_var_kl # for grpo
    ppo_epochs: 1
    shuffle: False
    ulysses_sequence_parallel_size: 1 # sp size
    optim:
      lr: 1e-6
      lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
      # min_lr_ratio: null # only useful for warmup with cosine
      warmup_style: constant # select from constant/cosine
      total_training_steps: -1 # must be override by program
    fsdp_config:
      wrap_policy:
        # transformer_layer_cls_to_wrap: None
        min_num_params: 0
      param_offload: False
      optimizer_offload: False
      fsdp_size: -1
  ref:
    fsdp_config:
      param_offload: False
      wrap_policy:
        # transformer_layer_cls_to_wrap: None
        min_num_params: 0
    # log_prob_micro_batch_size: 4 # will be deprecated, use log_prob_micro_batch_size_per_gpu
    log_prob_micro_batch_size_per_gpu: 1
    log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
    log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
    ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
  rollout:
    name: vllm
    temperature: 1.0
    top_k: -1 # 0 for hf rollout, -1 for vllm rollout
    top_p: 1
    use_fire_sampling: False # https://arxiv.org/abs/2410.21236
    prompt_length: ${data.max_prompt_length} # not use for opensource
    response_length: ${data.max_response_length}
    # for vllm rollout
    dtype: bfloat16 # should align with FSDP
    gpu_memory_utilization: 0.4
    ignore_eos: False
    enforce_eager: True
    free_cache_engine: True
    load_format: dummy_dtensor
    tensor_model_parallel_size: 1
    max_num_batched_tokens: 8192
    max_model_len: null
    max_num_seqs: 1024
    # log_prob_micro_batch_size: 8 # will be deprecated, use log_prob_micro_batch_size_per_gpu
    log_prob_micro_batch_size_per_gpu: 1
    log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
    log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
    disable_log_stats: True
    enable_chunked_prefill: True # could get higher throughput
    # for hf rollout
    do_sample: True
    # number of responses (i.e. num sample times)
    n: 8 # should be > 1 for grpo; Currently is unused parameter

critic:
  strategy: fsdp
  optim:
    lr: 1e-5
    lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
    # min_lr_ratio: null # only useful for warmup with cosine
    warmup_style: constant # select from constant/cosine
    total_training_steps: -1 # must be override by program
  model:
    path: /PATH/TO/MODEL/CHECKPOINT/
    tokenizer_path: ${actor_rollout_ref.model.path}
    override_config: { }
    external_lib: ${actor_rollout_ref.model.external_lib}
    enable_gradient_checkpointing: True
    use_remove_padding: False
    fsdp_config:
      param_offload: False
      optimizer_offload: False
      wrap_policy:
        # transformer_layer_cls_to_wrap: None
        min_num_params: 0
      fsdp_size: -1
  ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
  # ppo_micro_batch_size: 8 # will be deprecated, use ppo_micro_batch_size_per_gpu
  ppo_micro_batch_size_per_gpu: 1
  forward_micro_batch_size: ${critic.ppo_micro_batch_size}
  forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
  use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
  ppo_max_token_len_per_gpu: 16384 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2
  forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
  ulysses_sequence_parallel_size: 1 # sp size
  ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
  shuffle: ${actor_rollout_ref.actor.shuffle}
  grad_clip: 1.0
  cliprange_value: 0.5

reward_model:
  enable: False
  strategy: fsdp
  model:
    input_tokenizer: ${actor_rollout_ref.model.path} # set this to null if the chat template is identical
    path: ~/models/FsfairX-LLaMA3-RM-v0.1
    external_lib: ${actor_rollout_ref.model.external_lib}
    use_remove_padding: False
    fsdp_config:
      min_num_params: 0
      param_offload: False
      fsdp_size: -1
  # micro_batch_size: null # will be deprecated, use micro_batch_size_per_gpu
  # micro_batch_size_per_gpu: 2 # set a number
  # max_length: null
  ulysses_sequence_parallel_size: 1 # sp size
  use_dynamic_bsz: ${critic.use_dynamic_bsz}
  forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}

custom_reward_function:
  path: null
  name: compute_score

algorithm:
  gamma: 1.0
  lam: 1.0
  adv_estimator: grpo
  kl_penalty: kl # how to estimate kl divergence
  kl_ctrl:
    type: fixed
    kl_coef: 0.001

trainer:
  balance_batch: True
  total_epochs: 15
  # total_training_steps: null
  project_name: sciworld
  experiment_name: sciworld_RFT
  logger: [ 'wandb' ]
  val_generations_to_log_to_wandb: 0
  nnodes: 1
  n_gpus_per_node: 2
  save_freq: 1
  # auto: find the last ckpt to resume. If can't find, start from scratch
  resume_mode: auto # or auto or resume_path if
  resume_from_path: False
  test_freq: 100
  critic_warmup: 0
  default_hdfs_dir: null
  remove_previous_ckpt_in_save: False
  del_local_ckpt_after_load: False
  default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
  val_before_train: False
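
`sciworld.yaml` hands training off to this file via `trainer_config_path`, so a few fields are expected to stay in sync across the two, for example the 4096/16384 prompt and response budgets and the model checkpoint path. Below is a hedged sketch of such a cross-check, again treating both files as plain YAML; which pairs must agree is an assumption drawn from the matching values above, not a documented contract.

```python
# Sketch: cross-check values that appear in both new config files.
# The "should agree" pairs below are an assumption based on the matching
# values in this PR, not a documented contract between the two files.
import yaml


def load(path):
    with open(path) as f:
        return yaml.safe_load(f)


rft = load("scripts/config/sciworld.yaml")
trn = load("scripts/config/train_sciworld.yaml")

pairs = [
    ("prompt length", rft["model"]["max_prompt_tokens"], trn["data"]["max_prompt_length"]),
    ("response length", rft["model"]["max_response_tokens"], trn["data"]["max_response_length"]),
    ("model path", rft["model"]["model_path"], trn["actor_rollout_ref"]["model"]["path"]),
]
for name, a, b in pairs:
    print(f"{name}: {a} vs {b} -> {'OK' if a == b else 'MISMATCH'}")
```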
1 change: 0 additions & 1 deletion scripts/data_prepare/get_alfworld_data.py
@@ -39,7 +39,6 @@ def create_dataset_files(output_dir, train_size=1024, test_size=100):
     # create dataset_dict
     dataset_dict = {"train": train_data, "test": test_data}
 
-    # Save as JSONL
     for split, data in dataset_dict.items():
         output_file = os.path.join(output_dir, f"{split}.jsonl")
         with open(output_file, "w") as f:
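
The script writes each split as JSONL, which matches the tutorial's note that the data follows the Hugging Face `datasets` format. A minimal sketch for loading the prepared splits back is shown below; the output directory is a placeholder, not a path defined in this PR.

```python
# Sketch: reload the JSONL splits written by the data-preparation script.
# "scripts/data_prepare/alfworld_data" is a placeholder output directory.
from datasets import load_dataset

dataset = load_dataset(
    "json",
    data_files={
        "train": "scripts/data_prepare/alfworld_data/train.jsonl",
        "test": "scripts/data_prepare/alfworld_data/test.jsonl",
    },
)
print(dataset["train"][0])  # inspect one prepared sample
```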