
Commit 8ef0de9

chore: bump mcore and mbridge (#1902)

Authored by yfw (Yi-Fu Wu) and Shanmugam Ramasamy

Signed-off-by: Yi-Fu Wu <yifu.wu@gmail.com>
Co-authored-by: Shanmugam Ramasamy <shanmugamr@cw-dfw-cs-001-login-01.cm.cluster>

1 parent: 2d453b3

File tree: 12 files changed, +72 -49 lines changed

.gitmodules

Lines changed: 2 additions & 2 deletions

@@ -1,7 +1,7 @@
 [submodule "3rdparty/Megatron-LM"]
 	path = 3rdparty/Megatron-LM-workspace/Megatron-LM
-	url = https://github.com/terrykong/Megatron-LM.git
-	branch = yuya/nemo-rl-use-dev
+	url = https://github.com/yaoyu-33/Megatron-LM.git
+	branch = main
 	shallow = true
 [submodule "3rdparty/Megatron-Bridge"]
 	path = 3rdparty/Megatron-Bridge-workspace/Megatron-Bridge
Submodule Megatron-Bridge updated 384 files
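(Since the Megatron-LM submodule's remote URL and tracked branch both change here, a local checkout will likely need `git submodule sync --recursive` followed by `git submodule update --init --recursive` so the new remote is picked up before the pinned commit can be fetched.)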

3rdparty/Megatron-Bridge-workspace/setup.py

Lines changed: 3 additions & 2 deletions

@@ -26,8 +26,9 @@
 bridge_package_name = "megatron.bridge"
 
 CACHED_DEPENDENCIES = [
-    "transformers>=4.57.1",
+    "transformers<5.0.0",
     "datasets",
+    "accelerate",
     "omegaconf>=2.3.0",
     "tensorboard>=2.19.0",
     "typing-extensions",
@@ -40,7 +41,7 @@
     "hydra-core>1.3,<=1.3.2",
     "megatron-core[dev,mlm]>=0.15.0a0,<0.17.0",
     "qwen-vl-utils",
-    "transformer-engine[pytorch]>=2.9.0a0,<2.10.0",
+    "transformer-engine[pytorch]>=2.10.0a0,<2.12.0",
     "mamba-ssm",
     "nvidia-resiliency-ext",
     "causal-conv1d",
Submodule Megatron-LM updated 966 files

3rdparty/Megatron-LM-workspace/setup.py

Lines changed: 9 additions & 8 deletions

@@ -44,30 +44,31 @@
 CACHED_DEPENDENCIES = [
     # Default dependencies from pyproject.toml
     "torch",
-    "numpy<2.0.0",
+    "numpy",
     "packaging>=24.2",
     # Dev dependencies from pyproject.toml
-    "nvidia-modelopt[torch]>=0.33.0a0,<0.34.0; sys_platform != 'darwin'",
-    "transformer-engine[pytorch]>=2.9.0a0,<2.10.0",
-    "nvidia-resiliency-ext>=0.4.0a0,<0.5.0",
+    "nvidia-modelopt[torch]; sys_platform != 'darwin'",
+    "transformer-engine[pytorch,core_cu13]>=2.9.0a0,<2.12.0",
+    "nvidia-resiliency-ext",
     "tqdm",
     "einops~=0.8",
     "tensorstore~=0.1,!=0.1.46,!=0.1.72",
     "nvtx~=0.2",
     "multi-storage-client~=0.27",
     "opentelemetry-api~=1.33.1",
-    "setuptools<80.0.0",
     "mamba-ssm~=2.2",
     "causal-conv1d~=1.5",
+    "flash-linear-attention~=0.3.2",
     "nv-grouped-gemm~=1.1",
     "megatron-energon[av_decode]~=6.0",
-    "av<16.0.0",
-    "flashinfer-python",
+    "av",
+    "flashinfer-python~=0.5.0",
     "wget",
     "onnxscript",
-    "flash-linear-attention~=0.3.2",
     # VCS dependency - must match pyproject.toml [tool.uv.sources]
     "emerging_optimizers @ git+https://github.com/NVIDIA-NeMo/Emerging-Optimizers.git@v0.1.0",
+    "datasets",
+    "fastapi~=0.50",
 ]
 
 

examples/configs/grpo_math_1B_megatron.yaml

Lines changed: 1 addition & 2 deletions

@@ -148,11 +148,10 @@ policy:
     top_k: null
   mcore_generation_config:
     buffer_size_gb: 20 # Total GPU memory (in GB) allocated for KV cache buffers
-    buffer_guaranteed_fraction: 0.1 # Fraction of buffer reserved for guaranteed active requests
     num_cuda_graphs: 16 # Number of CUDA graphs to pre-compile for different batch sizes
     block_size_tokens: 256 # Size of each KV cache block in tokens (affects memory granularity)
     use_cuda_graphs_for_non_decode_steps: true # Enable CUDA graphs for prefill/context processing
-    enable_chunked_prefill: true # Split long prefills into chunks for better memory management
+    enable_chunked_prefill: false # Split long prefills into chunks for better memory management
     unified_memory_level: 0 # Unified memory usage level (0=disabled, higher values enable more aggressive paging)
     max_tokens: 16384 # Maximum number of tokens to use in a single step. Analogous to vllm's max_num_batched_tokens
 
nemo_rl/models/megatron/config.py

Lines changed: 0 additions & 2 deletions

@@ -25,8 +25,6 @@
 class MegatronGenerationConfig(TypedDict):
     # Total GPU memory (in GB) allocated for KV cache buffers
     buffer_size_gb: int
-    # Fraction of buffer reserved for guaranteed active requests
-    buffer_guaranteed_fraction: float
     # Number of CUDA graphs to pre-compile for different batch sizes
     num_cuda_graphs: int
     # Size of each KV cache block in tokens (affects memory granularity)
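`buffer_guaranteed_fraction` is dropped here in lockstep with the YAML config above and the generation worker below, consistent with the bumped Megatron-core dynamic inference API no longer accepting that argument. For orientation, a minimal sketch of the TypedDict after this change; the fields past the visible hunk are inferred from the YAML keys above and are assumptions, not shown in this diff:

```python
from typing import TypedDict


class MegatronGenerationConfig(TypedDict):
    # Total GPU memory (in GB) allocated for KV cache buffers
    buffer_size_gb: int
    # Number of CUDA graphs to pre-compile for different batch sizes
    num_cuda_graphs: int
    # Size of each KV cache block in tokens (affects memory granularity)
    block_size_tokens: int
    # Enable CUDA graphs for prefill/context processing (inferred from YAML)
    use_cuda_graphs_for_non_decode_steps: bool
    # Split long prefills into chunks (inferred from YAML)
    enable_chunked_prefill: bool
    # Unified memory usage level, 0 = disabled (inferred from YAML)
    unified_memory_level: int
    # Max tokens per step; analogous to vLLM's max_num_batched_tokens (inferred)
    max_tokens: int
```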

nemo_rl/models/megatron/setup.py

Lines changed: 14 additions & 2 deletions

@@ -51,6 +51,7 @@
 from megatron.bridge.utils.instantiate_utils import InstantiationMode
 from megatron.bridge.utils.vocab_utils import calculate_padded_vocab_size
 from megatron.core import parallel_state
+from megatron.core.process_groups_config import ProcessGroupCollection
 from megatron.core.transformer import MegatronModule
 from megatron.core.transformer.module import Float16Module
 from megatron.core.transformer.transformer_config import TransformerConfig
@@ -663,12 +664,15 @@ def setup_model_and_optimizer(
     checkpointing_context = init_checkpointing_context(megatron_cfg.checkpoint)
 
     # Tokenizer
+    if megatron_cfg.tokenizer.hf_tokenizer_kwargs is None:
+        megatron_cfg.tokenizer.hf_tokenizer_kwargs = {}
+    megatron_cfg.tokenizer.hf_tokenizer_kwargs["trust_remote_code"] = True
+    megatron_cfg.tokenizer.hf_tokenizer_kwargs["use_fast"] = True
     build_tokenizer(
         megatron_cfg.tokenizer,
         make_vocab_size_divisible_by=megatron_cfg.model.make_vocab_size_divisible_by
         // megatron_cfg.model.tensor_model_parallel_size,
         tensor_model_parallel_size=megatron_cfg.model.tensor_model_parallel_size,
-        trust_remote_code=True,
     )
     assert megatron_cfg.model.vocab_size, "vocab size must be specified in model config"
 
@@ -731,6 +735,8 @@ def composed_peft_hook(model: list[MegatronModule]) -> list[MegatronModule]:
         pre_wrap_hook.extend([composed_peft_hook])
 
     # Model, optimizer, and learning rate.
+    pg_collection = ProcessGroupCollection.use_mpu_process_groups()
+    setattr(megatron_cfg.model, "_pg_collection", pg_collection)
     model = get_model(
         megatron_cfg.model,
         megatron_cfg.ddp,
@@ -739,6 +745,7 @@ def composed_peft_hook(model: list[MegatronModule]) -> list[MegatronModule]:
         data_parallel_random_init=megatron_cfg.rng.data_parallel_random_init,
         pre_wrap_hook=pre_wrap_hook,
         mixed_precision_wrapper=mixed_precision_wrapper,
+        pg_collection=pg_collection,
     )
     if load_optimizer:
         optimizer, scheduler = setup_optimizer(
@@ -872,6 +879,7 @@ def setup_reference_model_state(
         overlap_param_gather_with_optimizer_step=megatron_cfg.optimizer.overlap_param_gather_with_optimizer_step,
         pre_wrap_hook=megatron_cfg.rng.data_parallel_random_init,
         mixed_precision_wrapper=ref_mixed_precision_wrapper,
+        pg_collection=ProcessGroupCollection.use_mpu_process_groups(),
     )
 
     print("Loading the Reference Model")
@@ -925,19 +933,23 @@ def finalize_megatron_setup(
         megatron_cfg.ddp,
         optimizer,
         align_grad_reduce=megatron_cfg.dist.align_grad_reduce,
+        pg_collection=ProcessGroupCollection.use_mpu_process_groups(),
     )
 
     tokenizer_config = TokenizerConfig(
         tokenizer_type="HuggingFaceTokenizer",
         tokenizer_model=hf_model_name,
+        hf_tokenizer_kwargs={
+            "trust_remote_code": True,
+            "use_fast": True,
+        },
     )
 
     megatron_tokenizer = build_tokenizer(
         tokenizer_config,
         make_vocab_size_divisible_by=megatron_cfg.model.make_vocab_size_divisible_by
         // config["megatron_cfg"]["tensor_model_parallel_size"],
         tensor_model_parallel_size=config["megatron_cfg"]["tensor_model_parallel_size"],
-        trust_remote_code=True,
     )
 
     dp_size = worker_sharding_annotations.get_axis_size("data_parallel")
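The recurring pattern in these hunks is that process groups are now created once and passed explicitly, rather than read implicitly from global `parallel_state`. A minimal sketch of that plumbing; `build_with_explicit_pgs`, `model_cfg`, and `build_model` are hypothetical stand-ins for the real `get_model` call path above:

```python
from megatron.core.process_groups_config import ProcessGroupCollection


def build_with_explicit_pgs(model_cfg, build_model):
    # Derive a collection from the already-initialized model-parallel (mpu)
    # state; this is the only place the global state is consulted.
    pg_collection = ProcessGroupCollection.use_mpu_process_groups()
    # Stash it on the model config for hooks that only receive the config...
    setattr(model_cfg, "_pg_collection", pg_collection)
    # ...and thread it through the builder explicitly.
    return build_model(model_cfg, pg_collection=pg_collection)
```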

nemo_rl/models/policy/workers/megatron_policy_worker.py

Lines changed: 18 additions & 15 deletions

@@ -27,6 +27,7 @@
     maybe_finalize_async_save,
     save_checkpoint,
 )
+from megatron.bridge.training.utils.pg_utils import get_pg_collection
 from megatron.bridge.training.utils.train_utils import (
     logical_and_across_model_parallel_group,
     reduce_max_stat_across_model_parallel_group,
@@ -415,18 +416,20 @@ def train(
         else:
             update_successful, grad_norm, num_zeros_in_grad = (True, 0.0, 0.0)
 
+        pg_collection = get_pg_collection(self.model)
+
         # when freezing sub-models we may have a mixture of successful and unsuccessful ranks,
         # so we must gather across mp ranks
         update_successful = logical_and_across_model_parallel_group(
-            update_successful
+            update_successful, mp_group=pg_collection.mp
         )
         # grad_norm and num_zeros_in_grad will be None on ranks without trainable params,
         # so we must gather across mp ranks
         grad_norm: float = reduce_max_stat_across_model_parallel_group(
-            grad_norm
+            grad_norm, mp_group=pg_collection.mp
        )
         num_zeros_in_grad: float = reduce_max_stat_across_model_parallel_group(
-            num_zeros_in_grad
+            num_zeros_in_grad, mp_group=pg_collection.mp
         )
 
         if update_successful:
@@ -1036,9 +1039,6 @@ def generate(
         ]
         enable_chunked_prefill = mcore_generation_config["enable_chunked_prefill"]
         unified_memory_level = mcore_generation_config["unified_memory_level"]
-        buffer_guaranteed_fraction = mcore_generation_config[
-            "buffer_guaranteed_fraction"
-        ]
         max_tokens = mcore_generation_config["max_tokens"]
 
         model_config = self.model.config
@@ -1050,7 +1050,6 @@
             kv_channels=model_config.kv_channels,
             num_attention_heads=model_config.num_query_groups,
             max_sequence_length=self.cfg["generation"]["max_new_tokens"],
-            buffer_guaranteed_fraction=buffer_guaranteed_fraction,
             buffer_size_gb=buffer_size_gb,
             materialize_only_last_token_logits=False,
             num_cuda_graphs=num_cuda_graphs,
@@ -1061,7 +1060,7 @@
             use_cuda_graphs_for_non_decode_steps=use_cuda_graphs_for_non_decode_steps,
             use_flashinfer_fused_rope=False,
             unified_memory_level=unified_memory_level,
-            max_tokens_override=max_tokens,
+            max_tokens=max_tokens,
         )
         inference_wrapped_model = GPTInferenceWrapper(
             self.model, inference_wrapper_config, dynamic_context
@@ -1134,23 +1133,27 @@
 
         result = []
         while dynamic_engine.has_unfinished_requests():
-            result_step = dynamic_engine.step_modern(verbose=False)
-            finished_requests = result_step.get("finished_requests", [])
-            for finished_request in finished_requests:
-                result.append(finished_request)
+            result_step = dynamic_engine.step_modern()
+            result.extend(result_step["finished_request_records"])
 
         # Sort results by request_id to maintain original batch order
         result.sort(key=lambda x: x.request_id)
 
         out = {
-            "tokens": [x.prompt_tokens.tolist() + x.generated_tokens for x in result],
-            "logprobs": [x.prompt_log_probs + x.generated_log_probs for x in result],
+            "tokens": [
+                x.requests[0].prompt_tokens.tolist() + x.requests[0].generated_tokens
+                for x in result
+            ],
+            "logprobs": [
+                x.requests[0].prompt_log_probs + x.requests[0].generated_log_probs
+                for x in result
+            ],
         }
 
         input_lengths = data["input_lengths"]
         # pad the out "tokens" and "logprobs" and make them into tensors from lists
         batch_size = data["input_ids"].size(0)
-        max_gen_seq_len = max([len(x.generated_tokens) for x in result])
+        max_gen_seq_len = max([len(x.requests[0].generated_tokens) for x in result])
         padded_input_length = input_ids.size(1)
 
         max_seq_len = padded_input_length + max_gen_seq_len
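Taken together, the engine-loop changes amount to a new consumption contract for `step_modern()`: each step now returns finished request *records*, and each record wraps its underlying request(s) in a `.requests` list. A self-contained sketch of that pattern, with the engine passed in as a parameter (its construction is elided; the attribute names mirror this diff, not a documented API):

```python
def drain_finished_records(dynamic_engine):
    """Collect finished generations from a Megatron dynamic inference engine.

    Assumes the post-bump contract used in the diff above: step_modern()
    returns a dict whose "finished_request_records" entry holds records,
    each carrying one request in this code path.
    """
    records = []
    while dynamic_engine.has_unfinished_requests():
        step_result = dynamic_engine.step_modern()
        records.extend(step_result["finished_request_records"])
    # Restore original submission order.
    records.sort(key=lambda rec: rec.request_id)
    tokens = [
        rec.requests[0].prompt_tokens.tolist() + rec.requests[0].generated_tokens
        for rec in records
    ]
    logprobs = [
        rec.requests[0].prompt_log_probs + rec.requests[0].generated_log_probs
        for rec in records
    ]
    return tokens, logprobs
```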

tests/unit/models/megatron/test_megatron_setup.py

Lines changed: 6 additions & 0 deletions

@@ -920,6 +920,7 @@ def test_reinitialize_parallel_state_after_import(
 class TestSetupModelAndOptimizer:
     """Tests for setup_model_and_optimizer function."""
 
+    @patch("nemo_rl.models.megatron.setup.ProcessGroupCollection")
     @patch("nemo_rl.models.megatron.setup.GlobalState")
     @patch("nemo_rl.models.megatron.setup.initialize_megatron")
     @patch("nemo_rl.models.megatron.setup.set_jit_fusion_options")
@@ -946,6 +947,7 @@ def test_setup_with_param_sync_and_frozen_moe_router(
         mock_set_jit,
         mock_init_megatron,
         mock_global_state,
+        mock_pg_collection,
     ):
         """Test setup_model_and_optimizer with MoE router freezing."""
         from nemo_rl.models.megatron.setup import setup_model_and_optimizer
@@ -1006,6 +1008,7 @@ def test_setup_with_param_sync_and_frozen_moe_router(
 class TestSetupReferenceModelState:
     """Tests for setup_reference_model_state function."""
 
+    @patch("nemo_rl.models.megatron.setup.ProcessGroupCollection")
     @patch("nemo_rl.models.megatron.setup.init_checkpointing_context")
     @patch("nemo_rl.models.megatron.setup.GlobalState")
     @patch("nemo_rl.models.megatron.setup.get_model")
@@ -1019,6 +1022,7 @@ def test_setup_reference_model(
         mock_get_model,
         mock_global_state,
         mock_init_ckpt_context,
+        mock_pg_collection,
         capsys,
     ):
         """Test setup_reference_model_state when checkpoint exists."""
@@ -1075,6 +1079,7 @@ def test_setup_reference_model(
 class TestFinalizeMegatronSetup:
     """Tests for finalize_megatron_setup function."""
 
+    @patch("nemo_rl.models.megatron.setup.ProcessGroupCollection")
     @patch("nemo_rl.models.megatron.setup._update_model_config_funcs")
     @patch("nemo_rl.models.megatron.setup.build_tokenizer")
     @patch("nemo_rl.models.megatron.setup.AutoBridge")
@@ -1083,6 +1088,7 @@ def test_basic_finalize_setup(
         mock_auto_bridge,
         mock_build_tokenizer,
         mock_update_model_config,
+        mock_pg_collection,
     ):
         """Test basic finalize_megatron_setup."""
         from nemo_rl.models.megatron.setup import finalize_megatron_setup
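A note on why `mock_pg_collection` lands at the end of each signature: stacked `unittest.mock.patch` decorators inject their mocks bottom-up, so the newly added topmost patch maps to the last mock parameter (pytest fixtures such as `capsys` still follow all mocks). A standalone illustration:

```python
from unittest.mock import patch


@patch("os.getcwd")  # topmost patch -> injected last
@patch("os.getpid")  # bottommost patch -> injected first
def demo(mock_getpid, mock_getcwd):
    print(mock_getpid is mock_getcwd)  # False: two distinct mocks


demo()
```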
