
Commit ca1867f

Update to use latest MBridge (#589)

Signed-off-by: Charlie Truong <chtruong@nvidia.com>
1 parent 7872cef

4 files changed, +287 -176 lines changed

nemo_deploy/llm/inference/inference_base.py

Lines changed: 3 additions & 24 deletions
@@ -22,7 +22,6 @@
 import torch
 from megatron.bridge.training.model_load_save import build_and_load_model, load_model_config, load_tokenizer
 from megatron.bridge.training.tokenizers.tokenizer import MegatronTokenizer
-from megatron.bridge.utils.vocab_utils import calculate_padded_vocab_size
 from megatron.core.dist_checkpointing.core import check_is_distributed_checkpoint
 from megatron.core.dist_checkpointing.serialization import (
     get_default_load_sharded_strategy,
@@ -33,9 +32,6 @@
 from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import (
     GPTInferenceWrapper,
 )
-from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import (
-    InferenceWrapperConfig,
-)
 from megatron.core.inference.text_generation_controllers.text_generation_controller import (
     TextGenerationController,
 )
@@ -478,7 +474,7 @@ def create_mcore_engine(
     - GPTInferenceWrapper: Inference-wrapped model
     - Union[MCoreTokenizerWrappper, MegatronTokenizer]: Tokenizer instance
     """
-    if not HAVE_NEMO:
+    if not HAVE_NEMO and model_format == "nemo":
         raise UnavailableError(MISSING_NEMO_MSG)

     # Default to 1 for any parallelism dimension that's None
@@ -501,7 +497,6 @@ def create_mcore_engine(
             **model_config_kwargs,
         )
         model = modelList[0]
-        padded_vocab_size = model.vocab_size
     elif model_format == "megatron":
         modelList, tokenizer, mlm_args = setup_megatron_model_and_tokenizer_for_inference(
             checkpoint_path=path,
@@ -513,27 +508,11 @@ def create_mcore_engine(
             model_type=model_type,
         )
         model = modelList[0]
-        if mlm_args is not None:
-            padded_vocab_size = getattr(mlm_args, "padded_vocab_size", None)
-        else:
-            padded_vocab_size = calculate_padded_vocab_size(
-                model.config.vocab_size,
-                model.config.make_vocab_size_divisible_by,
-                model.config.tensor_model_parallel_size,
-            )
     else:
         raise ValueError(f"Model format {model_format} not supported.")
-    inference_wrapper_config = InferenceWrapperConfig(
-        hidden_size=model.config.hidden_size,
-        params_dtype=params_dtype,
-        inference_batch_times_seqlen_threshold=inference_batch_times_seqlen_threshold,
-        padded_vocab_size=padded_vocab_size,
-        inference_max_seq_length=inference_max_seq_length,
-        inference_max_requests=max_batch_size,
-    )
-    inference_context = StaticInferenceContext.from_config(inference_wrapper_config)

-    model_inference_wrapper = GPTInferenceWrapper(model, inference_wrapper_config, inference_context)
+    inference_context = StaticInferenceContext(max_batch_size, inference_max_seq_length)
+    model_inference_wrapper = GPTInferenceWrapper(model, inference_context)
     text_generation_controller = TextGenerationController(
         inference_wrapped_model=model_inference_wrapper, tokenizer=tokenizer
     )
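Taken together, this change drops the padded-vocab bookkeeping and the intermediate InferenceWrapperConfig: the static context is now built directly from the batch and sequence limits, and the GPT wrapper takes only the model and the context. A minimal sketch of the new wiring follows; the StaticInferenceContext import path is an assumption (the diff does not show its import), and model and tokenizer stand in for the objects produced by the loading code earlier in create_mcore_engine:

from megatron.core.inference.contexts import StaticInferenceContext  # assumed import path
from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import (
    GPTInferenceWrapper,
)
from megatron.core.inference.text_generation_controllers.text_generation_controller import (
    TextGenerationController,
)


def build_controller(model, tokenizer, max_batch_size, inference_max_seq_length):
    """Wire up text generation for an already-loaded MCore GPT model (sketch)."""
    # The context is constructed from the two limits directly; no
    # InferenceWrapperConfig is assembled anymore.
    inference_context = StaticInferenceContext(max_batch_size, inference_max_seq_length)
    # The wrapper no longer receives a separate config object, only the
    # model and the context.
    model_inference_wrapper = GPTInferenceWrapper(model, inference_context)
    return TextGenerationController(
        inference_wrapped_model=model_inference_wrapper, tokenizer=tokenizer
    )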

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -113,7 +113,7 @@ vllm = [
     { index = "pytorch-cu130", marker = "python_version < '3.9' and platform_machine == 'x86_64'" },
     { index = "pypi", marker = "platform_machine == 'aarch64'" },
 ]
-megatron-bridge = { git = "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git", rev = "7a50d2ee726ba17ea5e75acf4c56b0b53b43a0d6" }
+megatron-bridge = { git = "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git", rev = "65a21df6bdafc198c26baa26b748fe55f3a19fd9" }
 # nemo-toolkit = { git = "https://github.com/NVIDIA/NeMo.git", rev = "main" }

 [tool.uv]
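Since megatron-bridge is consumed straight from git rather than from a released package, the rev pin is the only version control here: bumping it to the newer full commit SHA is what pulls in the MBridge API that inference_base.py is adapted to above.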

tests/unit_tests/deploy/test_inference_base.py

Lines changed: 9 additions & 95 deletions
@@ -23,8 +23,14 @@
     GPTInferenceWrapper,
 )
 from megatron.core.transformer.module import MegatronModule
-from nemo.collections.llm.gpt.model.base import GPTConfig
-from nemo.collections.llm.inference.base import MCoreTokenizerWrappper
+
+try:
+    from nemo.collections.llm.gpt.model.base import GPTConfig
+    from nemo.collections.llm.inference.base import MCoreTokenizerWrappper
+
+    HAVE_NEMO = True
+except (ImportError, ModuleNotFoundError):
+    HAVE_NEMO = False

 from nemo_deploy.llm.inference.inference_base import (
     MCoreEngineWithCleanup,
@@ -41,6 +47,7 @@
 from nemo_export_deploy_common.import_utils import UnavailableError


+@pytest.mark.skipif(not HAVE_NEMO, reason="NeMo is not installed")
 @pytest.mark.run_only_on("GPU")
 class TestInferenceBase(unittest.TestCase):
     def setUp(self):
@@ -291,99 +298,6 @@ def test_setup_model_calls_configure_model(
         # Verify that configure_model(tokenizer) was invoked
         self.mock_model.configure_model.assert_called_once_with(self.mock_tokenizer)

-    @patch("nemo_deploy.llm.inference.inference_base.HAVE_NEMO", True)
-    @patch("nemo_deploy.llm.inference.inference_base.calculate_padded_vocab_size")
-    @patch("nemo_deploy.llm.inference.inference_base.GPTInferenceWrapper")
-    @patch("nemo_deploy.llm.inference.inference_base.TextGenerationController")
-    @patch("nemo_deploy.llm.inference.inference_base.MCoreEngine")
-    @patch("nemo_deploy.llm.inference.inference_base.StaticInferenceContext")
-    @patch("nemo_deploy.llm.inference.inference_base.setup_megatron_model_and_tokenizer_for_inference")
-    def test_create_mcore_engine_megatron_with_mlm_args(
-        self,
-        mock_setup_meg,
-        mock_static_ctx,
-        mock_engine_class,
-        mock_tg_ctrl_class,
-        mock_gpt_wrapper_class,
-        mock_calc_pad_vocab,
-    ):
-        # Prepare model.config used by InferenceWrapperConfig
-        mock_model = MagicMock()
-        mock_model.config = MagicMock()
-        mock_model.config.hidden_size = 256
-        mock_model.config.vocab_size = 32000
-        mock_model.config.make_vocab_size_divisible_by = 128
-        mock_model.config.tensor_model_parallel_size = 1
-
-        mock_tokenizer = MagicMock()
-
-        # mlm_args with explicit padded_vocab_size
-        mlm_args = MagicMock()
-        mlm_args.padded_vocab_size = 1234
-
-        mock_setup_meg.return_value = ([mock_model], mock_tokenizer, mlm_args)
-        mock_static_ctx.from_config.return_value = MagicMock()
-
-        from nemo_deploy.llm.inference.inference_base import create_mcore_engine
-
-        create_mcore_engine(path=self.mock_path, model_format="megatron")
-
-        # Ensure we did NOT compute padded vocab when mlm_args provides it
-        mock_calc_pad_vocab.assert_not_called()
-
-        # Validate padded_vocab_size flowed into GPTInferenceWrapper config
-        args, kwargs = mock_gpt_wrapper_class.call_args
-        inference_wrapper_config = args[1]
-        self.assertEqual(inference_wrapper_config.padded_vocab_size, 1234)
-        self.assertEqual(inference_wrapper_config.hidden_size, 256)
-
-    @patch("nemo_deploy.llm.inference.inference_base.HAVE_NEMO", True)
-    @patch("nemo_deploy.llm.inference.inference_base.calculate_padded_vocab_size")
-    @patch("nemo_deploy.llm.inference.inference_base.GPTInferenceWrapper")
-    @patch("nemo_deploy.llm.inference.inference_base.TextGenerationController")
-    @patch("nemo_deploy.llm.inference.inference_base.MCoreEngine")
-    @patch("nemo_deploy.llm.inference.inference_base.StaticInferenceContext")
-    @patch("nemo_deploy.llm.inference.inference_base.setup_megatron_model_and_tokenizer_for_inference")
-    def test_create_mcore_engine_megatron_without_mlm_args_uses_calculated_padded_vocab(
-        self,
-        mock_setup_meg,
-        mock_static_ctx,
-        mock_engine_class,
-        mock_tg_ctrl_class,
-        mock_gpt_wrapper_class,
-        mock_calc_pad_vocab,
-    ):
-        # Prepare model.config used by InferenceWrapperConfig and pad calculation
-        mock_model = MagicMock()
-        mock_model.config = MagicMock()
-        mock_model.config.hidden_size = 512
-        mock_model.config.vocab_size = 30000
-        mock_model.config.make_vocab_size_divisible_by = 128
-        mock_model.config.tensor_model_parallel_size = 2
-
-        mock_tokenizer = MagicMock()
-
-        mock_setup_meg.return_value = ([mock_model], mock_tokenizer, None)
-        mock_static_ctx.from_config.return_value = MagicMock()
-        mock_calc_pad_vocab.return_value = 24576
-
-        from nemo_deploy.llm.inference.inference_base import create_mcore_engine
-
-        create_mcore_engine(path=self.mock_path, model_format="megatron")
-
-        # Ensure padded vocab was computed with expected args
-        mock_calc_pad_vocab.assert_called_once_with(
-            mock_model.config.vocab_size,
-            mock_model.config.make_vocab_size_divisible_by,
-            mock_model.config.tensor_model_parallel_size,
-        )
-
-        # Validate padded_vocab_size flowed into GPTInferenceWrapper config
-        args, kwargs = mock_gpt_wrapper_class.call_args
-        inference_wrapper_config = args[1]
-        self.assertEqual(inference_wrapper_config.padded_vocab_size, 24576)
-        self.assertEqual(inference_wrapper_config.hidden_size, 512)
-
     @patch("nemo_deploy.llm.inference.inference_base.check_is_distributed_checkpoint")
     @patch("nemo_deploy.llm.inference.inference_base.ckpt_to_weights_subdir")
     @patch("nemo_deploy.llm.inference.inference_base.ckpt_to_context_subdir")
