NVIDIA · xxi-nv · Mar 26, 2026
@@ -304,6 +304,16 @@ def __pp_init__(self):
         pp_layer_list = mapping.pp_layers(num_hidden_layers)
         has_pp_layer = len(pp_layer_list) > 0
         for layer_idx, layer in enumerate(self.layers):
+            if layer_idx >= num_hidden_layers:
+                # Extra layers (e.g., MTP speculative layers) appended beyond
+                # the base model. Layers that define skip_forward become no-ops
+                # in the main decoder loop. Weights are removed on all PP ranks
+                # except the last, where the MTP draft worker needs them.
+                if hasattr(layer, 'skip_forward'):
+                    layer.forward = layer.skip_forward
+                if not mapping.is_last_pp_rank():
+                    remove_weights(layer)
+                continue
             is_last_layer = (layer_idx == num_hidden_layers - 1)
             if layer_idx not in pp_layer_list:
                 # keep next layer's input_layernorm's weights for fusion

diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
@@ -276,7 +276,6 @@ unittest/_torch/ray_orchestrator/single_gpu/test_llm_update_weights.py::test_llm
 unittest/_torch/ray_orchestrator/single_gpu/test_llm_update_weights.py::test_llm_update_weights_with_quant_config[Qwen3/Qwen3-30B-A3B-Qwen3/Qwen3-30B-A3B-FP8] SKIP (https://nvbugspro.nvidia.com/bug/5911788)
 unittest/_torch/ray_orchestrator/single_gpu/test_llm_update_weights.py::test_llm_partial_update_weights[Qwen3/Qwen3-30B-A3B] SKIP (https://nvbugspro.nvidia.com/bug/5911788)
 unittest/_torch/ray_orchestrator/single_gpu/test_llm_update_weights.py::test_llm_partial_update_weights[Qwen3/Qwen3-8B] SKIP (https://nvbugspro.nvidia.com/bug/5911788)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-pp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-low_precision_combine=False-torch_compile=False] SKIP (https://nvbugspro.nvidia.com/bug/5916092)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] SKIP (https://nvbugspro.nvidia.com/bug/5916155)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True] SKIP (https://nvbugspro.nvidia.com/bug/5916155)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True] SKIP (https://nvbugspro.nvidia.com/bug/5916155)