Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
259 changes: 259 additions & 0 deletions examples/configs/audio_grpo_3B_megatron.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,259 @@
grpo:
num_prompts_per_step: 8
num_generations_per_prompt: 4
max_rollout_turns: 1
max_num_epochs: 1
max_num_steps: 500
normalize_rewards: true
use_leave_one_out_baseline: true
val_period: 10
val_at_start: false
val_at_end: false
overlong_filtering: false
max_val_samples: 32
val_batch_size: 32
seed: 42
use_dynamic_sampling: false
batch_multiplier: 1
reward_shaping:
enabled: false
overlong_buffer_length: 512
overlong_buffer_penalty: 1
max_response_length: ${policy.max_total_sequence_length}
# Advantage Estimator Configuration
# Options: "grpo" (default) or "reinforce_plus_plus"
adv_estimator:
name: "grpo" # Use "reinforce_plus_plus" for Reinforce++ estimator
normalize_rewards: ${grpo.normalize_rewards}
use_leave_one_out_baseline: ${grpo.use_leave_one_out_baseline}
minus_baseline: true # Reinforce++-baseline specific: subtract per-prompt mean baseline
reward_scaling:
enabled: false
source_min: 0.0
source_max: 1.0
target_min: 0.0
target_max: 1.0
async_grpo:
enabled: false
max_trajectory_age_steps: 1
seq_logprob_error_threshold: null
loss_fn:
reference_policy_kl_penalty: 0.01
# Can be set to k1, k2, k3
# For more details, see http://joschu.net/blog/kl-approx.html
reference_policy_kl_type: "k3"
kl_input_clamp_value: 20.0
kl_output_clamp_value: 10.0
ratio_clip_min: 0.2
ratio_clip_max: 0.2
ratio_clip_c: null
use_on_policy_kl_approximation: false
use_importance_sampling_correction: false
truncated_importance_sampling_ratio: null
token_level_loss: true
force_on_policy_ratio: false
checkpointing:
enabled: true
checkpoint_dir: results/audio_grpo_3B_megatron
metric_name: "val:accuracy"
higher_is_better: true
keep_top_k: 3
save_period: 100
checkpoint_must_save_by: null
policy:
model_name: /workspace_yuekai/HF/Qwen2.5-Omni-3B
tokenizer:
name: ${policy.model_name}
train_global_batch_size: 32
train_micro_batch_size: 1
generation_batch_size: 32
logprob_batch_size: 4
max_total_sequence_length: 2048
precision: bfloat16
offload_optimizer_for_logprob: false
dtensor_cfg:
_v2: true
enabled: false
cpu_offload: false
sequence_parallel: false
activation_checkpointing: false
tensor_parallel_size: 1
context_parallel_size: 1
custom_parallel_plan: null
dynamic_batching:
enabled: false
train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
sequence_length_round: 64
make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size}
max_grad_norm: 1.0
sequence_packing:
enabled: false
train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
algorithm: modified_first_fit_decreasing
sequence_length_round: 64
scheduler:
- name: torch.optim.lr_scheduler.LinearLR
kwargs:
start_factor: 0.1
end_factor: 1.0
total_iters: 50
- name: torch.optim.lr_scheduler.ConstantLR
kwargs:
factor: 1.0
total_iters: 10000000000
- milestones:
- 50
generation:
backend: vllm
max_new_tokens: 1024
temperature: 1.0
top_p: 1.0
top_k: null
stop_token_ids: null
stop_strings: null
vllm_cfg:
async_engine: false
precision: ${policy.precision}
kv_cache_dtype: "auto"
tensor_parallel_size: 1
pipeline_parallel_size: 1
expert_parallel_size: 1
gpu_memory_utilization: 0.6
max_model_len: ${policy.max_total_sequence_length}
enforce_eager: false
enable_expert_parallel: false
# Audio/multimodal models require tokenizer to be initialized before generation
skip_tokenizer_init: false
limit_mm_per_prompt:
audio: 1
colocated:
enabled: true
resources:
gpus_per_node: null
num_nodes: null
megatron_cfg:
enabled: true
empty_unused_memory_level: 1
activation_checkpointing: false
converter_type: Qwen2_5OmniForConditionalGeneration
tensor_model_parallel_size: 1
expert_tensor_parallel_size: 1
expert_model_parallel_size: 1
pipeline_model_parallel_size: 1
num_layers_in_first_pipeline_stage: null
num_layers_in_last_pipeline_stage: null
context_parallel_size: 1
pipeline_dtype: ${policy.precision}
sequence_parallel: false
freeze_moe_router: true
moe_router_dtype: fp64
moe_router_load_balancing_type: none
moe_router_bias_update_rate: 0.0
moe_permute_fusion: false
apply_rope_fusion: false
bias_activation_fusion: true
defer_fp32_logits: false
moe_per_layer_logging: false
moe_enable_deepep: false
moe_token_dispatcher_type: "allgather"
moe_shared_expert_overlap: false
peft:
enabled: false
target_modules: []
exclude_modules: []
dim: 8
alpha: 32
dropout: 0.0
dropout_position: "post"
lora_A_init_method: "xavier"
lora_B_init_method: "zero"
a2a_experimental: false
lora_dtype: null
optimizer:
optimizer: adam
lr: 2.0e-07
min_lr: 2.0e-07
weight_decay: 0.01
bf16: true
fp16: false
params_dtype: float32
adam_beta1: 0.9
adam_beta2: 0.999
adam_eps: 1.0e-08
sgd_momentum: 0.9
use_distributed_optimizer: true
use_precision_aware_optimizer: true
clip_grad: ${policy.max_grad_norm}
optimizer_cpu_offload: false
optimizer_offload_fraction: 0.0
scheduler:
start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
weight_decay_incr_style: constant
lr_decay_style: constant
lr_decay_iters: 1000
lr_warmup_iters: 50
lr_warmup_init: 2.0e-08
distributed_data_parallel_config:
grad_reduce_in_fp32: false
overlap_grad_reduce: false
overlap_param_gather: true
use_custom_fsdp: false
data_parallel_sharding_strategy: optim_grads_params
fp8_cfg:
enabled: false
fp8: "e4m3"
fp8_recipe: "blockwise"
fp8_param: false
data:
max_input_seq_length: ${policy.max_total_sequence_length}
shuffle: true
num_workers: 1

# use multiple dataloader for train
use_multiple_dataloader: false

# dataset
train:
dataset_name: avqa
split: train
validation:
dataset_name: avqa
split: validation
# default settings for all datasets
default:
prompt_file: examples/prompts/avqa_cot.txt
system_prompt_file: null
processor: "vlm_hf_data_processor"
env_name: "avqa"
env:
avqa:
num_workers: 8
reward_functions:
- name: format
weight: 0.2
- name: exact_alnum
weight: 0.8
logger:
log_dir: logs
num_val_samples_to_print: 0
wandb_enabled: true
tensorboard_enabled: true
swanlab_enabled: false
mlflow_enabled: false
monitor_gpus: false
wandb:
project: grpo-dev
name: audio-grpo-3b-megatron
swanlab:
project: grpo-dev
name: audio-grpo-3b-megatron
tensorboard: {}
gpu_monitoring:
collection_interval: 10
flush_interval: 10
cluster:
gpus_per_node: 8
num_nodes: 1
Loading
Loading