@@ -128,8 +128,6 @@ def __init__(self, config, layer_idx: int):
         self.out_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.use_bias)
         self.use_bias = config.use_bias

-        self.register_load_state_dict_pre_hook(self._load_state_dict_pre_hook)
-
     def torch_forward(self, input_states):
         batch_size, seq_len, _ = input_states.shape
         dtype = input_states.dtype
@@ -191,14 +189,6 @@ def torch_forward(self, input_states):
     def forward(self, hidden_states):
         return self.torch_forward(hidden_states)

-    @staticmethod
-    def _load_state_dict_pre_hook(module, state_dict, prefix, local_metadata, strict,
-                                  missing_keys, unexpected_keys, error_msgs) -> None:
-        A_log_key = prefix + "A_log"
-        A_minus_key = prefix + "A_minus"
-        if A_log_key in state_dict:
-            state_dict[A_minus_key] = -torch.exp(state_dict.pop(A_log_key).float())
-

 class NemotronHRMSNorm(nn.Module):
     def __init__(self, hidden_size, eps=1e-6):
@@ -592,6 +582,13 @@ def __init__(self, config):
         self.backbone = NemotronHModel(config)
         self.vocab_size = config.vocab_size
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+        # Collect backbone modules that own an A_minus or A_log parameter
+        self.backbone_modules_with_A = []
+        for module_name, module in self.backbone.named_modules():
+            for param_name, _ in module.named_parameters(recurse=False):
+                if param_name in ("A_minus", "A_log"):
+                    self.backbone_modules_with_A.append((module_name, param_name))
+        self.register_load_state_dict_pre_hook(self._a_log_pre_hook)  # register once, not per matching parameter

         # Initialize weights and apply final processing
         self.post_init()
@@ -622,5 +619,23 @@ def forward(

         return NemotronHCausalLMOutput(logits)

+    @staticmethod
+    def _a_log_pre_hook(
+        module,
+        state_dict,
+        prefix,
+        local_metadata,
+        strict,
+        missing_keys,
+        unexpected_keys,
+        error_msgs,
+    ) -> None:
+        # Rewrite every legacy A_log entry as A_minus = -exp(A_log), with exp taken in float32
+        all_keys = list(state_dict.keys())
+        for key in all_keys:
+            if "A_log" in key:
+                A_minus_key = key.replace("A_log", "A_minus")
+                state_dict[A_minus_key] = -torch.exp(state_dict.pop(key).float())
+

 AutoModelForCausalLMFactory.register_custom_model_cls("NemotronHConfig", NemotronHForCausalLM)
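
For context, a standalone sketch of what the relocated pre-hook does: a checkpoint that still stores the legacy A_log parameterization is rewritten in place to the A_minus = -exp(A_log) form before weights are loaded, with the exponentiation done in float32. The key name below is illustrative, not taken from a real NemotronH checkpoint.

import torch

# Hypothetical state dict with a legacy A_log entry (name is illustrative)
state_dict = {"backbone.layers.0.mixer.A_log": torch.zeros(4)}

# Same rewrite the pre-hook applies before load_state_dict consumes the dict
for key in list(state_dict.keys()):
    if "A_log" in key:
        state_dict[key.replace("A_log", "A_minus")] = -torch.exp(
            state_dict.pop(key).float()
        )

print(state_dict)  # {'backbone.layers.0.mixer.A_minus': tensor([-1., -1., -1., -1.])}

Moving the hook from each mixer layer to the top-level model lets one pass over the full state dict convert every layer's A_log key, instead of running one prefix-scoped hook per layer.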