27 changes: 3 additions & 24 deletions nemo_deploy/llm/inference/inference_base.py
@@ -22,7 +22,6 @@
 import torch
 from megatron.bridge.training.model_load_save import build_and_load_model, load_model_config, load_tokenizer
 from megatron.bridge.training.tokenizers.tokenizer import MegatronTokenizer
-from megatron.bridge.utils.vocab_utils import calculate_padded_vocab_size
 from megatron.core.dist_checkpointing.core import check_is_distributed_checkpoint
 from megatron.core.dist_checkpointing.serialization import (
     get_default_load_sharded_strategy,
@@ -33,9 +32,6 @@
 from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import (
     GPTInferenceWrapper,
 )
-from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import (
-    InferenceWrapperConfig,
-)
 from megatron.core.inference.text_generation_controllers.text_generation_controller import (
     TextGenerationController,
 )
@@ -478,7 +474,7 @@ def create_mcore_engine(
         - GPTInferenceWrapper: Inference-wrapped model
         - Union[MCoreTokenizerWrappper, MegatronTokenizer]: Tokenizer instance
     """
-    if not HAVE_NEMO:
+    if not HAVE_NEMO and model_format == "nemo":
         raise UnavailableError(MISSING_NEMO_MSG)
 
     # Default to 1 for any parallelism dimension that's None
@@ -501,7 +497,6 @@
             **model_config_kwargs,
         )
         model = modelList[0]
-        padded_vocab_size = model.vocab_size
     elif model_format == "megatron":
         modelList, tokenizer, mlm_args = setup_megatron_model_and_tokenizer_for_inference(
             checkpoint_path=path,
@@ -513,27 +508,11 @@
             model_type=model_type,
         )
         model = modelList[0]
-        if mlm_args is not None:
-            padded_vocab_size = getattr(mlm_args, "padded_vocab_size", None)
-        else:
-            padded_vocab_size = calculate_padded_vocab_size(
-                model.config.vocab_size,
-                model.config.make_vocab_size_divisible_by,
-                model.config.tensor_model_parallel_size,
-            )
     else:
         raise ValueError(f"Model format {model_format} not supported.")
-    inference_wrapper_config = InferenceWrapperConfig(
-        hidden_size=model.config.hidden_size,
-        params_dtype=params_dtype,
-        inference_batch_times_seqlen_threshold=inference_batch_times_seqlen_threshold,
-        padded_vocab_size=padded_vocab_size,
-        inference_max_seq_length=inference_max_seq_length,
-        inference_max_requests=max_batch_size,
-    )
-    inference_context = StaticInferenceContext.from_config(inference_wrapper_config)
 
-    model_inference_wrapper = GPTInferenceWrapper(model, inference_wrapper_config, inference_context)
+    inference_context = StaticInferenceContext(max_batch_size, inference_max_seq_length)
+    model_inference_wrapper = GPTInferenceWrapper(model, inference_context)
     text_generation_controller = TextGenerationController(
         inference_wrapped_model=model_inference_wrapper, tokenizer=tokenizer
     )
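Net effect of the inference_base.py changes: the padded-vocab-size bookkeeping and the InferenceWrapperConfig indirection are gone, the static inference context is built directly from the batch and sequence limits, and the NeMo availability check now fires only for model_format == "nemo", so megatron-format checkpoints no longer require NeMo. A minimal before/after sketch of the construction, assuming `model`, `max_batch_size`, and `inference_max_seq_length` are already in scope; the GPTInferenceWrapper import is taken from this diff, while the StaticInferenceContext import path is not shown here and is assumed:

from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import (
    GPTInferenceWrapper,
)
# Assumed import path for StaticInferenceContext; not visible in this diff.
from megatron.core.inference.contexts import StaticInferenceContext

# Old path (removed above): build an InferenceWrapperConfig carrying
# hidden_size / params_dtype / padded_vocab_size, derive the context from it,
# and pass both config and context to the wrapper:
#   inference_context = StaticInferenceContext.from_config(inference_wrapper_config)
#   model_inference_wrapper = GPTInferenceWrapper(model, inference_wrapper_config, inference_context)

# New path: the context takes the limits directly, and the wrapper
# takes only the model and the context.
inference_context = StaticInferenceContext(max_batch_size, inference_max_seq_length)
model_inference_wrapper = GPTInferenceWrapper(model, inference_context)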
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -113,7 +113,7 @@ vllm = [
     { index = "pytorch-cu130", marker = "python_version < '3.9' and platform_machine == 'x86_64'" },
     { index = "pypi", marker = "platform_machine == 'aarch64'" },
 ]
-megatron-bridge = { git = "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git", rev = "7a50d2ee726ba17ea5e75acf4c56b0b53b43a0d6" }
+megatron-bridge = { git = "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git", rev = "65a21df6bdafc198c26baa26b748fe55f3a19fd9" }
 # nemo-toolkit = { git = "https://github.com/NVIDIA/NeMo.git", rev = "main" }
 
 [tool.uv]
104 changes: 9 additions & 95 deletions tests/unit_tests/deploy/test_inference_base.py
@@ -23,8 +23,14 @@
     GPTInferenceWrapper,
 )
 from megatron.core.transformer.module import MegatronModule
-from nemo.collections.llm.gpt.model.base import GPTConfig
-from nemo.collections.llm.inference.base import MCoreTokenizerWrappper
+
+try:
+    from nemo.collections.llm.gpt.model.base import GPTConfig
+    from nemo.collections.llm.inference.base import MCoreTokenizerWrappper
+
+    HAVE_NEMO = True
+except (ImportError, ModuleNotFoundError):
+    HAVE_NEMO = False
 
 from nemo_deploy.llm.inference.inference_base import (
     MCoreEngineWithCleanup,
@@ -41,6 +47,7 @@
 from nemo_export_deploy_common.import_utils import UnavailableError
 
 
+@pytest.mark.skipif(not HAVE_NEMO, reason="NeMo is not installed")
 @pytest.mark.run_only_on("GPU")
 class TestInferenceBase(unittest.TestCase):
     def setUp(self):
@@ -291,99 +298,6 @@ def test_setup_model_calls_configure_model
         # Verify that configure_model(tokenizer) was invoked
         self.mock_model.configure_model.assert_called_once_with(self.mock_tokenizer)
 
-    @patch("nemo_deploy.llm.inference.inference_base.HAVE_NEMO", True)
-    @patch("nemo_deploy.llm.inference.inference_base.calculate_padded_vocab_size")
-    @patch("nemo_deploy.llm.inference.inference_base.GPTInferenceWrapper")
-    @patch("nemo_deploy.llm.inference.inference_base.TextGenerationController")
-    @patch("nemo_deploy.llm.inference.inference_base.MCoreEngine")
-    @patch("nemo_deploy.llm.inference.inference_base.StaticInferenceContext")
-    @patch("nemo_deploy.llm.inference.inference_base.setup_megatron_model_and_tokenizer_for_inference")
-    def test_create_mcore_engine_megatron_with_mlm_args(
-        self,
-        mock_setup_meg,
-        mock_static_ctx,
-        mock_engine_class,
-        mock_tg_ctrl_class,
-        mock_gpt_wrapper_class,
-        mock_calc_pad_vocab,
-    ):
-        # Prepare model.config used by InferenceWrapperConfig
-        mock_model = MagicMock()
-        mock_model.config = MagicMock()
-        mock_model.config.hidden_size = 256
-        mock_model.config.vocab_size = 32000
-        mock_model.config.make_vocab_size_divisible_by = 128
-        mock_model.config.tensor_model_parallel_size = 1
-
-        mock_tokenizer = MagicMock()
-
-        # mlm_args with explicit padded_vocab_size
-        mlm_args = MagicMock()
-        mlm_args.padded_vocab_size = 1234
-
-        mock_setup_meg.return_value = ([mock_model], mock_tokenizer, mlm_args)
-        mock_static_ctx.from_config.return_value = MagicMock()
-
-        from nemo_deploy.llm.inference.inference_base import create_mcore_engine
-
-        create_mcore_engine(path=self.mock_path, model_format="megatron")
-
-        # Ensure we did NOT compute padded vocab when mlm_args provides it
-        mock_calc_pad_vocab.assert_not_called()
-
-        # Validate padded_vocab_size flowed into GPTInferenceWrapper config
-        args, kwargs = mock_gpt_wrapper_class.call_args
-        inference_wrapper_config = args[1]
-        self.assertEqual(inference_wrapper_config.padded_vocab_size, 1234)
-        self.assertEqual(inference_wrapper_config.hidden_size, 256)
-
-    @patch("nemo_deploy.llm.inference.inference_base.HAVE_NEMO", True)
-    @patch("nemo_deploy.llm.inference.inference_base.calculate_padded_vocab_size")
-    @patch("nemo_deploy.llm.inference.inference_base.GPTInferenceWrapper")
-    @patch("nemo_deploy.llm.inference.inference_base.TextGenerationController")
-    @patch("nemo_deploy.llm.inference.inference_base.MCoreEngine")
-    @patch("nemo_deploy.llm.inference.inference_base.StaticInferenceContext")
-    @patch("nemo_deploy.llm.inference.inference_base.setup_megatron_model_and_tokenizer_for_inference")
-    def test_create_mcore_engine_megatron_without_mlm_args_uses_calculated_padded_vocab(
-        self,
-        mock_setup_meg,
-        mock_static_ctx,
-        mock_engine_class,
-        mock_tg_ctrl_class,
-        mock_gpt_wrapper_class,
-        mock_calc_pad_vocab,
-    ):
-        # Prepare model.config used by InferenceWrapperConfig and pad calculation
-        mock_model = MagicMock()
-        mock_model.config = MagicMock()
-        mock_model.config.hidden_size = 512
-        mock_model.config.vocab_size = 30000
-        mock_model.config.make_vocab_size_divisible_by = 128
-        mock_model.config.tensor_model_parallel_size = 2
-
-        mock_tokenizer = MagicMock()
-
-        mock_setup_meg.return_value = ([mock_model], mock_tokenizer, None)
-        mock_static_ctx.from_config.return_value = MagicMock()
-        mock_calc_pad_vocab.return_value = 24576
-
-        from nemo_deploy.llm.inference.inference_base import create_mcore_engine
-
-        create_mcore_engine(path=self.mock_path, model_format="megatron")
-
-        # Ensure padded vocab was computed with expected args
-        mock_calc_pad_vocab.assert_called_once_with(
-            mock_model.config.vocab_size,
-            mock_model.config.make_vocab_size_divisible_by,
-            mock_model.config.tensor_model_parallel_size,
-        )
-
-        # Validate padded_vocab_size flowed into GPTInferenceWrapper config
-        args, kwargs = mock_gpt_wrapper_class.call_args
-        inference_wrapper_config = args[1]
-        self.assertEqual(inference_wrapper_config.padded_vocab_size, 24576)
-        self.assertEqual(inference_wrapper_config.hidden_size, 512)
-
     @patch("nemo_deploy.llm.inference.inference_base.check_is_distributed_checkpoint")
     @patch("nemo_deploy.llm.inference.inference_base.ckpt_to_weights_subdir")
     @patch("nemo_deploy.llm.inference.inference_base.ckpt_to_context_subdir")
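With the padded-vocab plumbing gone from create_mcore_engine, the two megatron-format engine tests above were deleted rather than rewritten, and the whole suite now skips cleanly where NeMo is absent. A self-contained sketch of the guard pattern the test module now uses; the try/except, flag name, and skip reason are taken verbatim from this diff, while the class and test names here are hypothetical, for illustration only:

import pytest

try:
    # Optional dependency: this import resolves only when NeMo is installed.
    from nemo.collections.llm.gpt.model.base import GPTConfig  # noqa: F401

    HAVE_NEMO = True
except (ImportError, ModuleNotFoundError):
    HAVE_NEMO = False


@pytest.mark.skipif(not HAVE_NEMO, reason="NeMo is not installed")
class TestNeedsNemo:
    # Hypothetical placeholder; the real guarded suite is TestInferenceBase.
    def test_gpt_config_importable(self):
        assert GPTConfig is not None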