@@ -250,14 +250,23 @@ def forward(self, hidden_states):

# Copied from transformers.models.nemotron.modeling_nemotron Nemotron->NemotronH
class NemotronHMLP(nn.Module):
def __init__(self, config, layer_idx: int, intermediate_size: Optional[int] = None):
def __init__(
self,
config,
layer_idx: int,
intermediate_size: Optional[int] = None,
is_expert: bool = False,
):
super().__init__()
self.config = config
self.layer_idx = layer_idx
self.hidden_size = config.hidden_size
self.intermediate_size = intermediate_size or config.intermediate_size
self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias)
# Use latent size for expert MLPs if provided by config (required for SuperV3)
use_latent_size = (getattr(self.config, "moe_latent_size", None) is not None) and is_expert
input_size = self.config.moe_latent_size if use_latent_size else self.hidden_size
self.up_proj = nn.Linear(input_size, self.intermediate_size, bias=config.mlp_bias)
self.down_proj = nn.Linear(self.intermediate_size, input_size, bias=config.mlp_bias)
self.act_fn = ACT2FN[config.mlp_hidden_act]

def forward(self, x):
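Note (reviewer sketch, not part of the diff): the new `is_expert` flag only changes which width the MLP projects from and back to. When the config defines `moe_latent_size` and the MLP is built as a routed expert, its input/output width becomes the latent size; otherwise it stays at `hidden_size`. A minimal, self-contained illustration of that selection logic, using a hypothetical config (the field values below are made up):

    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class _Cfg:  # hypothetical stand-in for the real model config
        hidden_size: int = 8192
        moe_latent_size: Optional[int] = 1024

    def mlp_io_width(cfg, is_expert: bool) -> int:
        # Mirrors the width selection added in NemotronHMLP.__init__ above.
        latent = getattr(cfg, "moe_latent_size", None)
        return latent if (latent is not None and is_expert) else cfg.hidden_size

    assert mlp_io_width(_Cfg(), is_expert=True) == 1024   # routed experts project from/to the latent space
    assert mlp_io_width(_Cfg(), is_expert=False) == 8192  # shared expert keeps the full hidden width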
@@ -271,7 +280,10 @@ def __init__(self, config, layer_idx: Optional[int] = None):
self.experts = nn.ModuleList(
[
NemotronHMLP(
config, intermediate_size=config.moe_intermediate_size, layer_idx=layer_idx
config,
layer_idx=layer_idx,
intermediate_size=config.moe_intermediate_size,
is_expert=True,
)
for _ in range(config.n_routed_experts)
]
@@ -281,7 +293,19 @@ def __init__(self, config, layer_idx: Optional[int] = None):
config=config,
intermediate_size=config.moe_shared_expert_intermediate_size,
layer_idx=layer_idx,
is_expert=False,
)
# Add latent projections when using latent MoE (required for SuperV3)
if getattr(config, "moe_latent_size", None) is not None:
self.fc1_latent_proj = nn.Linear(
config.hidden_size, config.moe_latent_size, bias=config.mlp_bias
)
self.fc2_latent_proj = nn.Linear(
config.moe_latent_size, config.hidden_size, bias=config.mlp_bias
)
else:
self.fc1_latent_proj = nn.Identity()
self.fc2_latent_proj = nn.Identity()

def forward(self, hidden_states: torch.Tensor):
residuals = hidden_states
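Note (reviewer sketch, not part of the diff): the rest of the MoE forward is not shown in this hunk, so the following is only a plausible use of the new projections, assuming `fc1_latent_proj`/`fc2_latent_proj` wrap the routed experts while the shared expert still sees the full hidden width. `self.gate`, the dense per-expert evaluation, and the residual handling are assumptions for illustration only:

    import torch

    def moe_forward_sketch(self, hidden_states: torch.Tensor) -> torch.Tensor:
        residuals = hidden_states
        router_logits = self.gate(hidden_states)                      # assumed router, not in this hunk
        weights = torch.softmax(router_logits, dim=-1)                 # [tokens, n_routed_experts]
        latent = self.fc1_latent_proj(hidden_states)                   # hidden -> latent (identity if unset)
        expert_out = torch.stack([e(latent) for e in self.experts])    # [experts, tokens, latent]; dense for clarity
        routed = torch.einsum("te,etl->tl", weights, expert_out)       # weighted sum over experts
        routed = self.fc2_latent_proj(routed)                          # latent -> hidden (identity if unset)
        return residuals + routed + self.shared_expert(hidden_states)  # residual/shared-expert combination assumed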
10 changes: 5 additions & 5 deletions tests/integration/defs/accuracy/test_llm_api_autodeploy.py
@@ -236,7 +236,7 @@ def test_fp8(self):

class TestNemotronSuperV3(LlmapiAccuracyTestHarness):
MODEL_NAME = "nvidia/Nemotron-Super-V3"
MODEL_PATH_BF16 = "/scratch/models/super-v3-iter_0440000/hf" # add to llm_models_root? I don't have permissions
MODEL_PATH_BF16 = f"{llm_models_root()}/Nemotron-Super-3-120B-A12B-dev"

def get_default_kwargs(self):
return {
@@ -265,15 +265,15 @@ def get_default_sampling_params(self):
n=beam_width,
use_beam_search=beam_width > 1)

@pytest.mark.skip_less_device_memory(
32000) # might need to require more memory
@pytest.mark.skip_less_device(8)
# 180GB works, might be able to go lower
@pytest.mark.skip_less_device_memory(180000)
@pytest.mark.skip_less_device(4)
def test_bf16(self):
kwargs = self.get_default_kwargs()
sampling_params = self.get_default_sampling_params()
with AutoDeployLLM(model=self.MODEL_PATH_BF16,
tokenizer=self.MODEL_PATH_BF16,
world_size=8,
world_size=4,
**kwargs) as llm:
task = MMLU(self.MODEL_NAME)
task.evaluate(llm, sampling_params=sampling_params)
2 changes: 2 additions & 0 deletions tests/integration/test_lists/test-db/l0_dgx_b200.yml
@@ -28,6 +28,8 @@ l0_dgx_b200:
- disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ucx[DeepSeek-V3-Lite-fp8]
- disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_nixl[DeepSeek-V3-Lite-fp8]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_adp_lmtp_tp4]
# ------------- AutoDeploy tests ---------------
- accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_bf16
- condition:
ranges:
system_gpu_count:
2 changes: 2 additions & 0 deletions tests/integration/test_lists/test-db/l0_dgx_h100.yml
@@ -124,6 +124,8 @@ l0_dgx_h100:
- disaggregated/test_auto_scaling.py::test_worker_restart[http-load_balancing]
- disaggregated/test_auto_scaling.py::test_minimal_instances[http-round_robin]
- disaggregated/test_auto_scaling.py::test_disagg_server_restart[http-round_robin]
# ------------- AutoDeploy tests ---------------
- accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_bf16
- condition:
ranges:
system_gpu_count:
1 change: 1 addition & 0 deletions tests/integration/test_lists/test-db/l0_dgx_h200.yml
@@ -134,6 +134,7 @@ l0_dgx_h200:
# ------------- AutoDeploy tests ---------------
- accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-4]
- accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_bf16
- accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_bf16
- condition:
ranges:
system_gpu_count: