Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[submodule "3rdparty/Megatron-LM"]
path = 3rdparty/Megatron-LM-workspace/Megatron-LM
url = https://github.com/ahmadki/Megatron-LM.git
branch = ahmadki/dist_optim_non_tensor_fix
url = https://github.com/shanmugamr1992/Megatron-LM.git
branch = fixes_latest
shallow = true
[submodule "3rdparty/Megatron-Bridge"]
path = 3rdparty/Megatron-Bridge-workspace/Megatron-Bridge
Expand Down
14 changes: 11 additions & 3 deletions examples/configs/grpo_math_1B.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,11 @@ policy:
moe_enable_deepep: false
moe_token_dispatcher_type: "allgather"
moe_shared_expert_overlap: false
moe_pad_experts_for_cuda_graph_inference: false
cuda_graph_impl: "local"
cuda_graph_scope: null
use_te_rng_tracker: true
inference_rng_tracker: true

optimizer:
optimizer: "adam"
Expand Down Expand Up @@ -252,13 +257,16 @@ policy:
stop_strings: null
mcore_generation_config:
buffer_size_gb: 20 # Total GPU memory (in GB) allocated for KV cache buffers
buffer_guaranteed_fraction: 0.1 # Fraction of buffer reserved for guaranteed active requests
num_cuda_graphs: 16 # Number of CUDA graphs to pre-compile for different batch sizes
block_size_tokens: 256 # Size of each KV cache block in tokens (affects memory granularity)
use_cuda_graphs_for_non_decode_steps: true # Enable CUDA graphs for prefill/context processing
enable_chunked_prefill: true # Split long prefills into chunks for better memory management
unified_memory_level: 0 # Unified memory usage level (0=disabled, higher values enable more aggressive paging)
unified_memory_level: 0 # Unified memory usage level (0=disabled, 1+=enables unified memory)
max_tokens: 16384 # Maximum number of tokens to use in a single step. Analogous to vllm's max_num_batched_tokens
enable_chunked_prefill: false
kv_cache_management_mode: "persist" # Can be "persist", "offload", or "recompute"
static_kv_memory_pointers: false # Relevant only for offload and recompute modes
materialize_only_last_token_logits: false

vllm_cfg:
async_engine: false
precision: ${policy.precision}
Expand Down
14 changes: 12 additions & 2 deletions examples/configs/grpo_math_1B_megatron.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,12 @@ policy:
moe_shared_expert_overlap: false
#gives ~20% training perf speedup with sequence packing
apply_rope_fusion: True
moe_pad_experts_for_cuda_graph_inference: false
cuda_graph_impl: "local"
cuda_graph_scope: "full_iteration_inference"
use_te_rng_tracker: true
inference_rng_tracker: true
batch_invariant_mode: false

optimizer:
optimizer: "adam"
Expand All @@ -125,6 +131,7 @@ policy:
clip_grad: ${policy.max_grad_norm}

scheduler:
override_opt_param_scheduler: true
start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
weight_decay_incr_style: "constant"
Expand All @@ -151,9 +158,12 @@ policy:
num_cuda_graphs: 16 # Number of CUDA graphs to pre-compile for different batch sizes
block_size_tokens: 256 # Size of each KV cache block in tokens (affects memory granularity)
use_cuda_graphs_for_non_decode_steps: true # Enable CUDA graphs for prefill/context processing
enable_chunked_prefill: false # Split long prefills into chunks for better memory management
unified_memory_level: 0 # Unified memory usage level (0=disabled, higher values enable more aggressive paging)
unified_memory_level: 0 # Unified memory usage level (0=disabled, 1+=enables unified memory)
max_tokens: 16384 # Maximum number of tokens to use in a single step. Analogous to vllm's max_num_batched_tokens
enable_chunked_prefill: false
kv_cache_management_mode: "persist" # Can be "persist", "offload", or "recompute"
static_kv_memory_pointers: false # Relevant only for offload and recompute modes
materialize_only_last_token_logits: false

vllm_cfg:
tensor_parallel_size: 1
Expand Down
3 changes: 2 additions & 1 deletion examples/configs/grpo_math_8B_megatron.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ policy:
train_global_batch_size: 512
train_micro_batch_size: 1
generation_batch_size: 32 # Only used when generating using HF backend
logprob_batch_size: 4
logprob_batch_size: ${policy.train_micro_batch_size}
max_total_sequence_length: 4096
precision: "bfloat16"

Expand Down Expand Up @@ -48,6 +48,7 @@ policy:
params_dtype: "float32"

scheduler:
override_opt_param_scheduler: true
start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
weight_decay_incr_style: "constant"
Expand Down
Loading
Loading