diff --git a/apps/grpo/qwen3_1_7b.yaml b/apps/grpo/qwen3_1_7b.yaml
index 53eec5cfb..800d2e973 100644
--- a/apps/grpo/qwen3_1_7b.yaml
+++ b/apps/grpo/qwen3_1_7b.yaml
@@ -3,7 +3,7 @@
 
 # Global configuration
 group_size: 8
-batch_size: 16
+local_batch_size: 16 # per-device batch size
 max_req_tokens: 512
 max_res_tokens: 512
 model: "Qwen/Qwen3-1.7B"
@@ -56,7 +56,7 @@ trainer:
   lr_scheduler:
     warmup_steps: 1
   training:
-    local_batch_size: ${batch_size}
+    local_batch_size: ${local_batch_size}
     seq_len: 2048
     max_norm: 1.0
     steps: 1000000
@@ -85,7 +85,7 @@ trainer:
 
 # Replay buffer configuration
 replay_buffer:
-  batch_size: ${batch_size}
+  batch_size: ${local_batch_size}
   max_policy_age: ${off_by_n}
   dp_size: ${trainer.parallelism.data_parallel_shard_degree} # Must equal trainer DP degree
 
diff --git a/apps/grpo/qwen3_32b.yaml b/apps/grpo/qwen3_32b.yaml
index ca88b349a..8100a988b 100644
--- a/apps/grpo/qwen3_32b.yaml
+++ b/apps/grpo/qwen3_32b.yaml
@@ -4,7 +4,7 @@
 
 # Global configuration
 group_size: 2
-batch_size: 8
+local_batch_size: 8 # per-device batch size
 max_req_tokens: 512
 max_res_tokens: 512
 model: "Qwen/Qwen3-32B"
@@ -59,7 +59,7 @@ trainer:
   lr_scheduler:
     warmup_steps: 1
   training:
-    local_batch_size: ${batch_size}
+    local_batch_size: ${local_batch_size}
     seq_len: 2048
     max_norm: 1.0
     steps: 1000000
@@ -87,7 +87,7 @@ trainer:
 
 # Replay buffer configuration
 replay_buffer:
-  batch_size: ${batch_size}
+  batch_size: ${local_batch_size}
   max_policy_age: ${off_by_n}
   # dp_size: ${trainer.parallelism.data_parallel_shard_degree} # Must equal trainer DP degree
   dp_size: 8
diff --git a/apps/grpo/qwen3_8b.yaml b/apps/grpo/qwen3_8b.yaml
index c46ee0620..7f183870d 100644
--- a/apps/grpo/qwen3_8b.yaml
+++ b/apps/grpo/qwen3_8b.yaml
@@ -3,7 +3,7 @@
 
 # Global configuration
 group_size: 8
-batch_size: 16
+local_batch_size: 16 # per-device batch size
 max_req_tokens: 512
 max_res_tokens: 512
 model: "Qwen/Qwen3-8B"
@@ -55,7 +55,7 @@ trainer:
   lr_scheduler:
     warmup_steps: 1
   training:
-    local_batch_size: ${batch_size}
+    local_batch_size: ${local_batch_size}
     seq_len: 2048
     max_norm: 1.0
     steps: 1000000
@@ -84,7 +84,7 @@ trainer:
 
 # Replay buffer configuration
 replay_buffer:
-  batch_size: ${batch_size}
+  batch_size: ${local_batch_size}
   max_policy_age: ${off_by_n}
   # This should match the dp_size of TorchTitan
   # Here it's set explicitly to 2, because we've set
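
For reference, below is a hypothetical resolved view of qwen3_1_7b.yaml after this rename. It is an illustration only, not part of the patch: it assumes OmegaConf-style ${...} interpolation, and it assumes the effective global batch seen by the trainer is local_batch_size times the data-parallel degree; neither assumption is stated in the diff itself. Note that only the global variable is renamed; the replay_buffer component's own key keeps the name batch_size, since it is the per-rank batch that each data-parallel trainer rank draws.

# Hypothetical resolved view (illustration; assumes OmegaConf-style interpolation)
local_batch_size: 16            # per-device batch size, renamed from batch_size

trainer:
  training:
    local_batch_size: 16        # ${local_batch_size} resolved

replay_buffer:
  batch_size: 16                # component key stays batch_size; only the global
                                # variable feeding it was renamed
  dp_size: ${trainer.parallelism.data_parallel_shard_degree}  # must equal trainer DP degree

# Assumed relationship (not stated in the patch):
#   effective global batch = local_batch_size * dp_size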