diff --git a/apps/grpo/qwen3_32b.yaml b/apps/grpo/qwen3_32b.yaml
index 8100a988b..5729517da 100644
--- a/apps/grpo/qwen3_32b.yaml
+++ b/apps/grpo/qwen3_32b.yaml
@@ -3,10 +3,10 @@
 # NOTE - This has not been tested for correctness yet! All testing so far has been only for infrastructure stability

 # Global configuration
-group_size: 2
-local_batch_size: 8 # per-device batch size
-max_req_tokens: 512
-max_res_tokens: 512
+group_size: 16
+local_batch_size: 32 # per-device batch size
+max_req_tokens: 1024
+max_res_tokens: 1024
 model: "Qwen/Qwen3-32B"
 off_by_n: 1 # Off by one by default

@@ -14,7 +14,7 @@ provisioner:
   launcher: slurm

 # Main loop configuration
-rollout_threads: 1 # Recommended to set equal to policy.num_replicas
+rollout_threads: 32 # setting this to 4x the number of policy replicas seems to work well

 # Observability configuration
 metric_logging:
@@ -69,8 +69,8 @@ trainer:
     enable: false
   parallelism:
     data_parallel_replicate_degree: 1
-    data_parallel_shard_degree: -1
-    tensor_parallel_degree: 1
+    data_parallel_shard_degree: 1
+    tensor_parallel_degree: 8
     pipeline_parallel_degree: 1
     context_parallel_degree: 1
     expert_parallel_degree: 1
@@ -90,7 +90,7 @@ replay_buffer:
   batch_size: ${local_batch_size}
   max_policy_age: ${off_by_n}
   # dp_size: ${trainer.parallelism.data_parallel_shard_degree} # Must equal trainer DP degree
-  dp_size: 8
+  dp_size: 1

 # Reference model configuration
 ref_model:
@@ -119,7 +119,7 @@
 services:
   policy:
     procs: ${policy.engine_config.tensor_parallel_size}
-    num_replicas: 1
+    num_replicas: 4
     hosts: 1
     with_gpus: true
   ref_model:
diff --git a/apps/mast/qwen3_14b_mast.yaml b/apps/mast/qwen3_14b_mast.yaml
index 484a71538..1d5300838 100644
--- a/apps/mast/qwen3_14b_mast.yaml
+++ b/apps/mast/qwen3_14b_mast.yaml
@@ -3,7 +3,7 @@
 # Global configuration
 group_size: 8
-batch_size: 16
+local_batch_size: 16 # per-device batch size
 max_req_tokens: 512
 max_res_tokens: 512
 model: "Qwen/Qwen3-14B"
 off_by_n: 1 # Off by one by default
@@ -61,7 +61,7 @@ trainer:
   lr_scheduler:
     warmup_steps: 1
   training:
-    local_batch_size: ${batch_size}
+    local_batch_size: ${local_batch_size}
     seq_len: 2048
     max_norm: 1.0
     steps: 1000000
@@ -95,7 +95,7 @@ trainer:

 # Replay buffer configuration
 replay_buffer:
-  batch_size: ${batch_size}
+  batch_size: ${local_batch_size}
   max_policy_age: ${off_by_n}
   dp_size: ${trainer.parallelism.data_parallel_shard_degree} # Must equal trainer DP degree

diff --git a/apps/mast/qwen3_1_7b_mast.yaml b/apps/mast/qwen3_1_7b_mast.yaml
index 58d879579..92d27da16 100644
--- a/apps/mast/qwen3_1_7b_mast.yaml
+++ b/apps/mast/qwen3_1_7b_mast.yaml
@@ -3,7 +3,7 @@
 # Global configuration
 group_size: 8
-batch_size: 16
+local_batch_size: 16 # per-device batch size
 max_req_tokens: 512
 max_res_tokens: 512
 model: "Qwen/Qwen3-1.7B"
 off_by_n: 1 # Off by one by default
@@ -61,7 +61,7 @@ trainer:
   lr_scheduler:
     warmup_steps: 1
   training:
-    local_batch_size: ${batch_size}
+    local_batch_size: ${local_batch_size}
     seq_len: 2048
     max_norm: 1.0
     steps: 1000000
@@ -95,7 +95,7 @@ trainer:

 # Replay buffer configuration
 replay_buffer:
-  batch_size: ${batch_size}
+  batch_size: ${local_batch_size}
   max_policy_age: ${off_by_n}
   dp_size: ${trainer.parallelism.data_parallel_shard_degree} # Must equal trainer DP degree

diff --git a/apps/mast/qwen3_32b_mast.yaml b/apps/mast/qwen3_32b_mast.yaml
index 47368becd..3fa79b955 100644
--- a/apps/mast/qwen3_32b_mast.yaml
+++ b/apps/mast/qwen3_32b_mast.yaml
@@ -2,10 +2,10 @@
 # >>> python -m apps.mast.main --config apps/mast/qwen3_1_7b_mast.yaml

 # Global configuration
-group_size: 8
-batch_size: 16
-max_req_tokens: 512
-max_res_tokens: 512
+group_size: 16
+local_batch_size: 32 # per-device batch size
+max_req_tokens: 1024
+max_res_tokens: 1024
 model: "Qwen/Qwen3-32B"
 off_by_n: 1 # Off by one by default
 launcher: mast
@@ -13,7 +13,7 @@ job_name: forge-qwen3-32b
 checkpoint_folder: /mnt/wsfuse/teamforge/forge_runs/

 # Main loop configuration
-rollout_threads: ${services.policy.num_replicas} # Recommended to set equal to policy.num_replicas
+rollout_threads: 32 # setting this to 4x the number of policy replicas seems to work well

 # Observability configuration
 metric_logging:
@@ -61,7 +61,7 @@ trainer:
   lr_scheduler:
     warmup_steps: 1
   training:
-    local_batch_size: ${batch_size}
+    local_batch_size: ${local_batch_size}
     seq_len: 2048
     max_norm: 1.0
     steps: 1000000
@@ -71,8 +71,8 @@ trainer:
     enable: false
   parallelism:
     data_parallel_replicate_degree: 1
-    data_parallel_shard_degree: 8
-    tensor_parallel_degree: 1
+    data_parallel_shard_degree: 1
+    tensor_parallel_degree: 8
     pipeline_parallel_degree: 1
     context_parallel_degree: 1
     expert_parallel_degree: 1
@@ -95,7 +95,7 @@ trainer:

 # Replay buffer configuration
 replay_buffer:
-  batch_size: ${batch_size}
+  batch_size: ${local_batch_size}
   max_policy_age: ${off_by_n}
   dp_size: ${trainer.parallelism.data_parallel_shard_degree} # Must equal trainer DP degree

@@ -129,13 +129,13 @@ ref_model:
 services:
   policy:
     procs: ${policy.engine_config.tensor_parallel_size}
-    num_replicas: 2
+    num_replicas: 4
     with_gpus: true
     mesh_name: policy
     hosts: 1
   ref_model:
-    procs: 4
-    num_replicas: 2
+    procs: ${ref_model.parallelism.tensor_parallel_degree}
+    num_replicas: 1
     with_gpus: true
     mesh_name: ref_model
     hosts: 1
diff --git a/apps/mast/qwen3_4b_mast.yaml b/apps/mast/qwen3_4b_mast.yaml
index 92119055a..a7e44e069 100644
--- a/apps/mast/qwen3_4b_mast.yaml
+++ b/apps/mast/qwen3_4b_mast.yaml
@@ -3,7 +3,7 @@
 # Global configuration
 group_size: 8
-batch_size: 16
+local_batch_size: 16 # per-device batch size
 max_req_tokens: 512
 max_res_tokens: 512
 model: "Qwen/Qwen3-4B"
 off_by_n: 1 # Off by one by default
@@ -61,7 +61,7 @@ trainer:
   lr_scheduler:
     warmup_steps: 1
   training:
-    local_batch_size: ${batch_size}
+    local_batch_size: ${local_batch_size}
     seq_len: 2048
     max_norm: 1.0
     steps: 1000000
@@ -95,7 +95,7 @@ trainer:

 # Replay buffer configuration
 replay_buffer:
-  batch_size: ${batch_size}
+  batch_size: ${local_batch_size}
   max_policy_age: ${off_by_n}
   dp_size: ${trainer.parallelism.data_parallel_shard_degree} # Must equal trainer DP degree

diff --git a/apps/mast/qwen3_8b_mast.yaml b/apps/mast/qwen3_8b_mast.yaml
index 7f2f99694..953f809b7 100644
--- a/apps/mast/qwen3_8b_mast.yaml
+++ b/apps/mast/qwen3_8b_mast.yaml
@@ -3,7 +3,7 @@
 # Global configuration
 group_size: 8
-batch_size: 16
+local_batch_size: 16 # per-device batch size
 max_req_tokens: 512
 max_res_tokens: 512
 model: "Qwen/Qwen3-8B"
 off_by_n: 1 # Off by one by default
@@ -61,7 +61,7 @@ trainer:
   lr_scheduler:
     warmup_steps: 1
   training:
-    local_batch_size: ${batch_size}
+    local_batch_size: ${local_batch_size}
     seq_len: 2048
     max_norm: 1.0
     steps: 1000000
@@ -95,7 +95,7 @@ trainer:

 # Replay buffer configuration
 replay_buffer:
-  batch_size: ${batch_size}
+  batch_size: ${local_batch_size}
   max_policy_age: ${off_by_n}
   dp_size: ${trainer.parallelism.data_parallel_shard_degree} # Must equal trainer DP degree
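
Note on the arithmetic behind the 32B changes (commentary, not part of the diff): the replay buffer's dp_size must equal the trainer's data-parallel degree, and switching the 32B trainer from DP sharding to tensor_parallel_degree: 8 on a single 8-GPU host leaves exactly one data-parallel rank, hence dp_size: 1. The sketch below is a minimal Python sanity check of those invariants; the check_grpo_config helper and the dict layout are illustrative assumptions, not code from the repo.

    def check_grpo_config(cfg, gpus_per_trainer_host=8):
        """Hypothetical sketch: validate the invariants the YAML comments state."""
        par = cfg["trainer"]["parallelism"]
        dp, tp = par["data_parallel_shard_degree"], par["tensor_parallel_degree"]
        # replay_buffer.dp_size "Must equal trainer DP degree": the buffer
        # splits each training batch across the trainer's data-parallel ranks.
        assert cfg["replay_buffer"]["dp_size"] == dp
        # With tensor_parallel_degree: 8 on an 8-GPU host, only one
        # data-parallel shard fits, hence dp_size: 1.
        assert dp * tp <= gpus_per_trainer_host
        # Heuristic from the diff: rollout_threads at least 4x policy replicas.
        replicas = cfg["services"]["policy"]["num_replicas"]
        assert cfg["rollout_threads"] >= 4 * replicas

    # Values from apps/grpo/qwen3_32b.yaml after this change:
    check_grpo_config({
        "rollout_threads": 32,
        "trainer": {"parallelism": {"data_parallel_shard_degree": 1,
                                    "tensor_parallel_degree": 8}},
        "replay_buffer": {"dp_size": 1},
        "services": {"policy": {"num_replicas": 4}},
    })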