Skip to content

Commit 19a3031

Browse files
[TRTLLM-10329][feat] Fix weight loading for Nemotron 3 models on DGX Spark (#11405)
Signed-off-by: Pamela <179191831+pamelap-nvidia@users.noreply.github.com>
1 parent 052fe2f commit 19a3031

File tree

2 files changed

+21
-2
lines changed

2 files changed

+21
-2
lines changed

tensorrt_llm/_torch/models/checkpoints/hf/nemotron_h_weight_mapper.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -80,7 +80,9 @@ def _split_mamba2_mixer_in_proj(w: torch.Tensor) -> torch.Tensor:
8080
elif "A" in key:
8181
w = split(weights[name], tp_size, tp_rank)
8282
w = w.to(torch.float32)
83-
w = -torch.exp(w)
83+
# Avoid extra temporaries: one fp32 cast, then in-place exp/neg.
84+
w.exp_()
85+
w.neg_()
8486
new_weights[key] = w
8587
elif "D" in key:
8688
w = split(weights[name], tp_size, tp_rank)

tensorrt_llm/_torch/modules/fused_moe/quantization.py

Lines changed: 18 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -23,7 +23,7 @@
2323
import torch.nn.functional as F
2424
from torch import nn
2525

26-
from tensorrt_llm._utils import get_sm_version, is_sm_100f
26+
from tensorrt_llm._utils import get_sm_version, is_device_integrated, is_sm_100f
2727
from tensorrt_llm.logger import logger
2828
from tensorrt_llm.quantization.functional import \
2929
preprocess_weights_for_mixed_gemm
@@ -38,6 +38,7 @@
3838
unswizzle_sf)
3939
from ..linear import TensorParallelMode, load_weight_shard
4040
from .interface import MoEWeightLoadingMode
41+
from .moe_load_balancer import advise_tensor_pageout
4142

4243
# The declarations aligns with moe_kernels.h
4344
# pack inputs into int64, e.g. 4 x bf16 input values
@@ -306,6 +307,20 @@ def load_expert_weights_to_dst(
306307
w3_w1_kargs["allow_partial_loading"] = allow_partial_loading
307308
if "allow_partial_loading" in w2_args:
308309
w2_kargs["allow_partial_loading"] = allow_partial_loading
310+
311+
def maybe_pageout_mmapped_cpu_weights(
312+
weight_tensors: List[object]) -> None:
313+
# Integrated GPU systems share physical memory with CPU. After we
314+
# finish copying from mmapped CPU weights, proactively advising the
315+
# kernel to drop those pages reduces shared-memory pressure.
316+
if not is_device_integrated():
317+
return
318+
for weight in weight_tensors:
319+
if (isinstance(weight, torch.Tensor)
320+
and weight.device.type == "cpu"
321+
and weight.is_contiguous()):
322+
advise_tensor_pageout(weight)
323+
309324
# Multithread weight load is superseded by prefetch_files() in model_engine.py
310325
# Also, threading adds overhead in order to protect shuffle index cache with critical section.
311326
for local_slot_id, expert_id in enumerate(load_expert_ids):
@@ -361,6 +376,7 @@ def load_expert_weights_to_dst(
361376
if weight is not None
362377
]
363378
module._add_raw_shared_weights_for_unmap(unmap_weights)
379+
maybe_pageout_mmapped_cpu_weights(unmap_weights)
364380

365381
if module.bias:
366382
self.load_expert_w3_w1_weight(
@@ -375,6 +391,7 @@ def load_expert_weights_to_dst(
375391
if weight is not None
376392
]
377393
module._add_raw_shared_weights_for_unmap(unmap_weights)
394+
maybe_pageout_mmapped_cpu_weights(unmap_weights)
378395

379396
def load_weights(self,
380397
module: torch.nn.Module,

0 commit comments

Comments (0)