21 changes: 9 additions & 12 deletions docs/sphinx_doc/source/tutorial/example_multi_turn.md
@@ -21,7 +21,7 @@ You may refer to their original environment to complete the setup.
 ### Data Preparation
 Our dataset follows the format of the Hugging Face `datasets` library, so we need to convert our environment dataset accordingly.
 
-Just run the following command.
+Just check the data preparation scripts and run the following command.
 ```bash
 # For ALFworld env
 python scripts/data_prepare/get_alfworld_data.py
@@ -53,18 +53,16 @@ We provide an easy way to allow you to build your own environment pipeline by creat…
 
 See `trinity/common/workflows/envs/alfworld/alfworld_workflow.py` for an example of how to construct a multi-round workflow.
 
-You can interact with the environment using the messages format, and call the `self.process_batch_messages` function to transform the messages and rewards into the `experience` we need and send them to the buffer.
+You can interact with the environment using the messages format, and call the `self.process_messages_to_experience` function to transform the messages and rewards into the `experience` we need and send them to the buffer.
 
 ```python
-class AlfworldWorkflow(Workflow):
+class AlfworldWorkflow(MultiTurnWorkflow):
     """A workflow for alfworld task."""
     ...
 
     def generate_env_inference_samples(self, env, rollout_num) -> List[Experience]:
         print("Generating env inference samples...")
-        all_messages = []
-        all_rewards = []
-        all_infos = []
+        experience_list = []
         for i in range(rollout_num):
             observation, info = env.reset()
             final_reward = -0.1
@@ -80,14 +78,13 @@ class AlfworldWorkflow(Workflow):
                 if done:
                     final_reward = reward
                     break
-            all_infos.append(
-                {"env_rounds": r, "env_done": 1 if done else 0}
+            experience = self.process_messages_to_experience(
+                memory, final_reward, {"env_rounds": r, "env_done": 1 if done else 0}
             )
-            all_messages.append(memory)
-            all_rewards.append(final_reward)
+            experience_list.append(experience)
         # Close the env to save cpu memory
         env.close()
-        return self.process_batch_messages(all_messages, all_rewards, all_infos=all_infos)
+        return experience_list
 
 
     def run(self) -> List[Experience]:
@@ -102,7 +99,7 @@ class AlfworldWorkflow(Workflow):
 Also, remember to register your workflow:
 ```python
 @WORKFLOWS.register_module("alfworld_workflow")
-class AlfworldWorkflow(Workflow):
+class AlfworldWorkflow(MultiTurnWorkflow):
     """A workflow for alfworld task."""
     ...
 ```
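
For reference, the new `scripts/config/sciworld.yaml` below sets `default_workflow_type: 'sciworld_workflow'`, which would be registered the same way. The sketch below only illustrates that pattern by mirroring the alfworld example above; the import paths, the message format, the `self.model.chat(...)` call, and the `max_env_steps` attribute are assumptions for illustration, not code from this PR.

```python
# Illustrative sketch only: a hypothetical ScienceWorld workflow mirroring the
# alfworld example above. Import paths, the env/model APIs, and the attributes
# marked "assumed" are not taken from this PR.
from typing import List

from trinity.common.experience import Experience                     # assumed path
from trinity.common.workflows import WORKFLOWS, MultiTurnWorkflow    # assumed path


@WORKFLOWS.register_module("sciworld_workflow")
class SciworldWorkflow(MultiTurnWorkflow):
    """A workflow for the ScienceWorld task (sketch)."""

    def generate_env_inference_samples(self, env, rollout_num) -> List[Experience]:
        experience_list = []
        for _ in range(rollout_num):
            observation, info = env.reset()
            memory = [{"role": "user", "content": observation}]      # assumed message format
            final_reward = -0.1
            done = False
            for r in range(self.max_env_steps):                      # assumed attribute
                response = self.model.chat(memory)[0].response_text  # assumed model API
                memory.append({"role": "assistant", "content": response})
                observation, reward, done, info = env.step(response)  # assumed env API
                memory.append({"role": "user", "content": observation})
                if done:
                    final_reward = reward
                    break
            experience = self.process_messages_to_experience(
                memory, final_reward, {"env_rounds": r, "env_done": 1 if done else 0}
            )
            experience_list.append(experience)
        # Close the env to save memory, as in the alfworld example
        env.close()
        return experience_list
```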
56 changes: 56 additions & 0 deletions scripts/config/sciworld.yaml
@@ -0,0 +1,56 @@
data:
  total_epoch: 20
  batch_size: 4
  dataset_path: 'scripts/data_prepare/sciworld_data'
  default_workflow_type: 'sciworld_workflow'
  train_split: 'train'
  eval_split: ''
  format_config:
    prompt_key: 'game_file'
model:
  model_path: '/PATH/TO/MODEL/CHECKPOINT/'
  max_prompt_tokens: 4096
  max_response_tokens: 16384
  checkpoint_path: 'checkpoints/sciworld_RFT'
cluster:
  node_num: 1
  gpu_per_node: 8
buffer:
  max_retry_times: 3
  max_retry_interval: 1
  train_dataset:
    name: sciworld_buffer
    storage_type: queue
    algorithm_type: ppo
    path: 'sqlite:///sciworld.db'
explorer:
  engine_type: vllm_async
  engine_num: 2
  runner_num: 32
  tensor_parallel_size: 2
  enable_prefix_caching: false
  enforce_eager: true
  dtype: bfloat16
  temperature: 1.0
  top_p: 1.0
  top_k: -1
  seed: 42
  logprobs: 0
  repeat_times: 8
  use_ray: false
  backend: 'nccl'
  max_pending_requests: 32
  max_waiting_steps: 4
  gpu_memory_utilization: 0.7
  enable_chunked_prefil: true
synchronizer:
  sync_method: 'online'
  sync_iteration_interval: 8
trainer:
  trainer_type: 'verl'
  algorithm_type: ppo
  trainer_config_path: 'scripts/config/train_sciworld.yaml'
monitor:
  cache_root_dir: ""
  project: "sciworld"
  name: "sciworld_RFT"
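
As a quick sanity check on the nesting above, the snippet below simply loads the file with PyYAML and derives two numbers from it: the GPUs claimed by the rollout engines (`engine_num * tensor_parallel_size = 4` of the 8 per node) and the experiences produced per explore step (`batch_size * repeat_times = 32`). Reading the file as plain nested YAML is the only assumption here.

```python
# Minimal sketch: load scripts/config/sciworld.yaml and check a few derived values.
# Assumes only that the file is plain YAML with the nesting shown above.
import yaml

with open("scripts/config/sciworld.yaml") as f:
    cfg = yaml.safe_load(f)

# 2 engines x TP 2 = 4 GPUs for rollout, within the 8 available per node
rollout_gpus = cfg["explorer"]["engine_num"] * cfg["explorer"]["tensor_parallel_size"]
assert rollout_gpus <= cfg["cluster"]["gpu_per_node"]

# 4 tasks per batch, each rolled out 8 times -> 32 experiences per explore step
per_step = cfg["data"]["batch_size"] * cfg["explorer"]["repeat_times"]
print(f"rollout GPUs: {rollout_gpus}, experiences per explore step: {per_step}")
print("trainer config:", cfg["trainer"]["trainer_config_path"])
```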
183 changes: 183 additions & 0 deletions scripts/config/train_sciworld.yaml
@@ -0,0 +1,183 @@
data:
  tokenizer: null
  train_files: train_example.parquet
  val_files: test_example.parquet
  prompt_key: prompt
  max_prompt_length: 4096
  max_response_length: 16384
  train_batch_size: 96
  val_batch_size: null
  return_raw_input_ids: False # This should be set to true when the tokenizer between policy and rm differs
  return_raw_chat: False
  shuffle: True
  filter_overlong_prompts: False # for large-scale dataset, filtering overlong prompts could be timeconsuming. You should disable this and set `truncation='left'`
  truncation: error
  image_key: images

actor_rollout_ref:
  hybrid_engine: True
  model:
    path: /PATH/TO/MODEL/CHECKPOINT/
    external_lib: null
    override_config: { }
    enable_gradient_checkpointing: True
    use_remove_padding: False
  actor:
    strategy: fsdp # This is for backward-compatibility
    ppo_mini_batch_size: 1536
    # ppo_micro_batch_size: 8 # will be deprecated, use ppo_micro_batch_size_per_gpu
    ppo_micro_batch_size_per_gpu: 1
    use_dynamic_bsz: False
    ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
    grad_clip: 1.0
    clip_ratio: 0.2
    entropy_coeff: 0.001
    use_kl_loss: True # True for GRPO
    kl_loss_coef: 0.001 # for grpo
    kl_loss_type: low_var_kl # for grpo
    ppo_epochs: 1
    shuffle: False
    ulysses_sequence_parallel_size: 1 # sp size
    optim:
      lr: 1e-6
      lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
      # min_lr_ratio: null # only useful for warmup with cosine
      warmup_style: constant # select from constant/cosine
      total_training_steps: -1 # must be override by program
    fsdp_config:
      wrap_policy:
        # transformer_layer_cls_to_wrap: None
        min_num_params: 0
      param_offload: False
      optimizer_offload: False
      fsdp_size: -1
  ref:
    fsdp_config:
      param_offload: False
      wrap_policy:
        # transformer_layer_cls_to_wrap: None
        min_num_params: 0
    # log_prob_micro_batch_size: 4 # will be deprecated, use log_prob_micro_batch_size_per_gpu
    log_prob_micro_batch_size_per_gpu: 1
    log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
    log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
    ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
  rollout:
    name: vllm
    temperature: 1.0
    top_k: -1 # 0 for hf rollout, -1 for vllm rollout
    top_p: 1
    use_fire_sampling: False # https://arxiv.org/abs/2410.21236
    prompt_length: ${data.max_prompt_length} # not use for opensource
    response_length: ${data.max_response_length}
    # for vllm rollout
    dtype: bfloat16 # should align with FSDP
    gpu_memory_utilization: 0.4
    ignore_eos: False
    enforce_eager: True
    free_cache_engine: True
    load_format: dummy_dtensor
    tensor_model_parallel_size: 1
    max_num_batched_tokens: 8192
    max_model_len: null
    max_num_seqs: 1024
    # log_prob_micro_batch_size: 8 # will be deprecated, use log_prob_micro_batch_size_per_gpu
    log_prob_micro_batch_size_per_gpu: 1
    log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
    log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
    disable_log_stats: True
    enable_chunked_prefill: True # could get higher throughput
    # for hf rollout
    do_sample: True
    # number of responses (i.e. num sample times)
    n: 8 # should be > 1 for grpo; Currently is unused parameter

critic:
  strategy: fsdp
  optim:
    lr: 1e-5
    lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
    # min_lr_ratio: null # only useful for warmup with cosine
    warmup_style: constant # select from constant/cosine
    total_training_steps: -1 # must be override by program
  model:
    path: /PATH/TO/MODEL/CHECKPOINT/
    tokenizer_path: ${actor_rollout_ref.model.path}
    override_config: { }
    external_lib: ${actor_rollout_ref.model.external_lib}
    enable_gradient_checkpointing: True
    use_remove_padding: False
    fsdp_config:
      param_offload: False
      optimizer_offload: False
      wrap_policy:
        # transformer_layer_cls_to_wrap: None
        min_num_params: 0
      fsdp_size: -1
  ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
  # ppo_micro_batch_size: 8 # will be deprecated, use ppo_micro_batch_size_per_gpu
  ppo_micro_batch_size_per_gpu: 1
  forward_micro_batch_size: ${critic.ppo_micro_batch_size}
  forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
  use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
  ppo_max_token_len_per_gpu: 16384 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2
  forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
  ulysses_sequence_parallel_size: 1 # sp size
  ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
  shuffle: ${actor_rollout_ref.actor.shuffle}
  grad_clip: 1.0
  cliprange_value: 0.5

reward_model:
  enable: False
  strategy: fsdp
  model:
    input_tokenizer: ${actor_rollout_ref.model.path} # set this to null if the chat template is identical
    path: ~/models/FsfairX-LLaMA3-RM-v0.1
    external_lib: ${actor_rollout_ref.model.external_lib}
    use_remove_padding: False
    fsdp_config:
      min_num_params: 0
      param_offload: False
      fsdp_size: -1
  # micro_batch_size: null # will be deprecated, use micro_batch_size_per_gpu
  # micro_batch_size_per_gpu: 2 # set a number
  # max_length: null
  ulysses_sequence_parallel_size: 1 # sp size
  use_dynamic_bsz: ${critic.use_dynamic_bsz}
  forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}

custom_reward_function:
  path: null
  name: compute_score

algorithm:
  gamma: 1.0
  lam: 1.0
  adv_estimator: grpo
  kl_penalty: kl # how to estimate kl divergence
  kl_ctrl:
    type: fixed
    kl_coef: 0.001

trainer:
  balance_batch: True
  total_epochs: 15
  # total_training_steps: null
  project_name: sciworld
  experiment_name: sciworld_RFT
  logger: [ 'wandb' ]
  val_generations_to_log_to_wandb: 0
  nnodes: 1
  n_gpus_per_node: 2
  save_freq: 1
  # auto: find the last ckpt to resume. If can't find, start from scratch
  resume_mode: auto # or auto or resume_path if
  resume_from_path: False
  test_freq: 100
  critic_warmup: 0
  default_hdfs_dir: null
  remove_previous_ckpt_in_save: False
  del_local_ckpt_after_load: False
  default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
  val_before_train: False
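
`sciworld.yaml` hands training off to this file via `trainer_config_path`, so a few fields are expected to stay in sync across the two, for example the 4096/16384 prompt and response budgets and the model checkpoint path. Below is a hedged sketch of such a cross-check, again treating both files as plain YAML; which pairs must agree is an assumption drawn from the matching values above, not a documented contract.

```python
# Sketch: cross-check values that appear in both new config files.
# The "should agree" pairs below are an assumption based on the matching
# values in this PR, not a documented contract between the two files.
import yaml


def load(path):
    with open(path) as f:
        return yaml.safe_load(f)


rft = load("scripts/config/sciworld.yaml")
trn = load("scripts/config/train_sciworld.yaml")

pairs = [
    ("prompt length", rft["model"]["max_prompt_tokens"], trn["data"]["max_prompt_length"]),
    ("response length", rft["model"]["max_response_tokens"], trn["data"]["max_response_length"]),
    ("model path", rft["model"]["model_path"], trn["actor_rollout_ref"]["model"]["path"]),
]
for name, a, b in pairs:
    print(f"{name}: {a} vs {b} -> {'OK' if a == b else 'MISMATCH'}")
```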
1 change: 0 additions & 1 deletion scripts/data_prepare/get_alfworld_data.py
@@ -39,7 +39,6 @@ def create_dataset_files(output_dir, train_size=1024, test_size=100):
     # create dataset_dict
     dataset_dict = {"train": train_data, "test": test_data}
 
-    # Save as JSONL
     for split, data in dataset_dict.items():
         output_file = os.path.join(output_dir, f"{split}.jsonl")
         with open(output_file, "w") as f:
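
The script writes each split as JSONL, which matches the tutorial's note that the data follows the Hugging Face `datasets` format. A minimal sketch for loading the prepared splits back is shown below; the output directory is a placeholder, not a path defined in this PR.

```python
# Sketch: reload the JSONL splits written by the data-preparation script.
# "scripts/data_prepare/alfworld_data" is a placeholder output directory.
from datasets import load_dataset

dataset = load_dataset(
    "json",
    data_files={
        "train": "scripts/data_prepare/alfworld_data/train.jsonl",
        "test": "scripts/data_prepare/alfworld_data/test.jsonl",
    },
)
print(dataset["train"][0])  # inspect one prepared sample
```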