Closed

Commits (showing changes from 16 of 22 commits)
cce8cb1  attempt at implementing nemotron_h architecture. (jwjohns, Aug 23, 2025)
423d890  fix(nemotron-h): Fix KV cache over-allocation for hybrid architecture (jwjohns, Aug 23, 2025)
f1acd11  update (jwjohns, Aug 24, 2025)
175d60e  Merge branch 'ggml-org:master' into feature/nemotron-h-support-working (jwjohns, Aug 24, 2025)
3a99e79  Merge branch 'ggml-org:master' into feature/nemotron-h-support-working (jwjohns, Aug 24, 2025)
1f55ace  Merge branch 'ggml-org:master' into feature/nemotron-h-support-working (jwjohns, Aug 25, 2025)
62accf9  working on the ssm tensors sizing (jwjohns, Aug 25, 2025)
cc9b929  still isnt working though progress is being made (jwjohns, Aug 25, 2025)
36dc3eb  fix nemotron-h tensor dimensions and gguf conversion (jwjohns, Aug 25, 2025)
657903a  Merge branch 'ggml-org:master' into feature/nemotron-h-support-working (jwjohns, Aug 25, 2025)
3df06e6  cleanup docs (jwjohns, Aug 25, 2025)
154459a  Merge branch 'ggml-org:master' into feature/nemotron-h-support-working (jwjohns, Aug 25, 2025)
ca4c978  resolving tensor dimensions (jwjohns, Aug 25, 2025)
a556953  Merge branch 'ggml-org:master' into feature/nemotron-h-support-working (jwjohns, Aug 25, 2025)
e2b0dda  implement a custom tensor creation (jwjohns, Aug 25, 2025)
0d9725c  update shapes to nvidia safetensors ground truth (jwjohns, Aug 26, 2025)
3efbb74  code review cleanup (jwjohns, Aug 26, 2025)
743681b  Merge branch 'ggml-org:master' into feature/nemotron-h-support-working (jwjohns, Aug 27, 2025)
bfc234d  convert_hf_to_gguf.py (jwjohns, Aug 27, 2025)
2ebaa43  cleanup debug logs and hardcoded portions (jwjohns, Aug 27, 2025)
497d73b  cleanup (jwjohns, Aug 27, 2025)
7c668fd  Applying the SSM_SCAN fix for n_groups > 1 (jwjohns, Aug 27, 2025)
228 changes: 227 additions & 1 deletion convert_hf_to_gguf.py
@@ -456,7 +456,7 @@ def load_hparams(dir_model: Path, is_mistral_format: bool):
try:
# for security reason, we don't allow loading remote code by default
# if a model need remote code, we will fallback to config.json
config = AutoConfig.from_pretrained(dir_model, trust_remote_code=False).to_dict()
config = AutoConfig.from_pretrained(dir_model, trust_remote_code=True).to_dict()
Contributor Author (jwjohns) commented:
I got tired of typing it. Temporary.

except Exception as e:
logger.warning(f"Failed to load model config from {dir_model}: {e}")
logger.warning("Trying to load config.json instead")
@@ -7892,6 +7892,232 @@ def set_gguf_parameters(self):
self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"]))


@ModelBase.register("NemotronHForCausalLM")
class NemotronHModel(Mamba2Model):
"""Nemotron-H is a hybrid SSM + Attention model with Mamba2 layers and attention layers"""
model_arch = gguf.MODEL_ARCH.NEMOTRON_H

def __init__(self, *args, **kwargs):
# Initialize the base Mamba2Model
super().__init__(*args, **kwargs)

# Use Llama conversion for attention layers
self._transformer_model_class = LlamaModel

# Nemotron-H specific parameters
self.n_group = self.find_hparam(["n_groups"], optional=True) or self.find_hparam(["num_groups"], optional=True) or 8
# Use actual conv1d tensor dimension for Nemotron-H (12288 not 15680)
self.d_inner = 12288 # Fixed: matches actual conv1d tensor dimensions
self.d_head = self.find_hparam(["mamba_head_dim"], optional=True) or (self.d_inner // max(1, self.find_hparam(["mamba_num_heads"], optional=True) or 1))
self.d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 128

# Initialize hybrid model attributes
self.has_attention = True

# Determine attention layers
self._attn_layers = self._get_attn_layers()

def set_gguf_parameters(self):
"""Override to skip Mamba2 parameter validation that doesn't apply to hybrid architecture"""
d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4
d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 128
head_dim = self.find_hparam(["mamba_d_head", "head_dim"], optional=True) or 64
rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5

# Skip the d_inner == 2 * d_model assertion for hybrid architectures
# Nemotron-H has a different inner dimension calculation based on mamba_num_heads * mamba_head_dim

self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default
self.gguf_writer.add_embedding_length(self.d_model)
self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading
self.gguf_writer.add_head_count(0) # unused, but seemingly required when loading
self.gguf_writer.add_block_count(self.block_count)
self.gguf_writer.add_ssm_conv_kernel(d_conv)
self.gguf_writer.add_ssm_inner_size(self.d_inner)
self.gguf_writer.add_ssm_state_size(d_state)
self.gguf_writer.add_ssm_time_step_rank(self.d_inner // head_dim)
self.gguf_writer.add_ssm_group_count(self.n_group)
self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
self.gguf_writer.add_file_type(self.ftype)
self.has_mamba = True
self.has_mlp = True

# Emit layer schedule: 0=SSM, 1=ATTN, 2=FFN (default FFN none here)
layer_types = np.zeros((self.block_count,), dtype=np.uint8)
for i in self._attn_layers:
if 0 <= i < self.block_count:
layer_types[i] = 1
# store schedule array
self.gguf_writer.add_array(f"{gguf.MODEL_ARCH_NAMES[self.model_arch]}.layer_types", layer_types)

def set_vocab(self):
self._set_vocab_gpt2()

def modify_tensors(self, data_torch, name, bid):
# Custom tensor name mapping for Nemotron-H hybrid architecture

# Handle token embeddings and output tensors
if "backbone.embeddings.weight" in name:
return [(self.map_tensor_name("token_embd.weight"), data_torch)]
elif "backbone.norm.weight" in name:
return [(self.map_tensor_name("output_norm.weight"), data_torch)]
elif "backbone.lm_head.weight" in name:
return [(self.map_tensor_name("output.weight"), data_torch)]

# Handle layer-specific tensors
if "backbone.layers." in name and bid is not None:
# Extract the actual layer component name
parts = name.split(".")
if len(parts) >= 4:
layer_component = ".".join(parts[3:]) # Everything after "backbone.layers.X"

# Detect layer type based on tensor names and map accordingly
if layer_component == "norm.weight":
# Layer norm (not mixer norm) - all layers use attn_norm in llama.cpp
new_name = f"blk.{bid}.attn_norm.weight"
elif any(x in layer_component for x in ["A_log", "D", "conv1d", "dt_bias", "in_proj", "mixer.norm", "out_proj"]):
# Mamba layer tensors (note: mixer.norm, not just norm.weight)
new_name = self._map_mamba_tensor(layer_component, bid)
# NVIDIA GROUND TRUTH TENSOR TRANSFORMATIONS

# Conv1d: NVIDIA [12288, 1, 4] -> llama.cpp [4, 12288]
if "conv1d.weight" in layer_component:
original_shape = data_torch.shape
if len(data_torch.shape) == 3: # [12288, 1, 4]
# Remove middle dimension and transpose: [12288, 1, 4] -> [12288, 4] -> [4, 12288]
data_torch = data_torch.squeeze(1).t().contiguous() # -> [4, 12288]
elif len(data_torch.shape) == 2: # [12288, 4]
data_torch = data_torch.t().contiguous() # [12288, 4] -> [4, 12288]
# Ensure final shape is exactly [4, 12288]
assert data_torch.shape == (4, 12288), f"Conv1d wrong final shape: {data_torch.shape}"
print(f"DEBUG: Conv1d {layer_component} {original_shape} -> {data_torch.shape}")

# A_log: NVIDIA [128] -> llama.cpp [128, 1] with -exp transform
if layer_component.endswith("A_log"):
data_torch = -torch.exp(data_torch) # Apply -exp transformation
if len(data_torch.shape) == 1: # [128]
data_torch = data_torch.reshape(128, 1) # -> [128, 1] explicitly

# D: NVIDIA [128] -> llama.cpp [128, 1]
if layer_component.endswith("D"):
if len(data_torch.shape) == 1: # [128]
data_torch = data_torch.reshape(128, 1) # -> [128, 1] explicitly

# Grouped RMSNorm: NVIDIA [10240] -> llama.cpp [1280, 8]
if layer_component == "mixer.norm.weight":
if len(data_torch.shape) == 1: # [10240]
# 10240 elements = 1280 * 8 groups
data_torch = data_torch.reshape(1280, 8)
# in_proj needs split order expected by llama.cpp mamba2 builder: [z, xBC, dt]
if layer_component == "mixer.in_proj.weight":
W = data_torch
# Expected logical sizes
d_x_part = self.d_inner + 2 * self.n_group * self.d_state
n_head = max(1, self.d_inner // max(1, self.d_head))
exp_d_in_proj = 2 * self.d_inner + 2 * self.n_group * self.d_state + n_head
# Detect orientation: [n_embd, d_in_proj] or [d_in_proj, n_embd]
if W.shape[1] == self.d_model and W.shape[0] == exp_d_in_proj:
W = W.t().contiguous()
n_embd, d_in_proj = W.shape
# Validate
if d_in_proj < (self.d_inner + d_x_part + n_head):
# Can't reliably repack; keep original mapping
return [(self._map_mamba_tensor(layer_component, bid), data_torch)]
# Assume dt at the end
dt = W[:, -n_head:]
body = W[:, : d_in_proj - n_head]
# Two common packings: [z, xBC] or [xBC, z]
# Prefer moving z to the front: [z, xBC, dt]
# Heuristic: pick the split that yields xBC width == d_x_part
z_first = False
# Try xBC first
xbc = body[:, : d_x_part]
z = body[:, d_x_part: d_x_part + self.d_inner]
if z.shape[1] != self.d_inner:
# Try z first
z_first = True
z = body[:, : self.d_inner]
xbc = body[:, self.d_inner: self.d_inner + d_x_part]
repacked = torch.cat([z, xbc, dt], dim=1)
data_torch = repacked
elif any(x in layer_component for x in ["q_proj", "k_proj", "v_proj", "o_proj"]):
# Attention layer tensors
new_name = self._map_attention_tensor(layer_component, bid)
elif any(x in layer_component for x in ["down_proj", "up_proj"]):
# MLP layer tensors
new_name = self._map_mlp_tensor(layer_component, bid)
else:
# Fallback to default mapping
return super().modify_tensors(data_torch, name, bid)

return [(new_name, data_torch)]

# Default to parent processing
return super().modify_tensors(data_torch, name, bid)

def _map_mamba_tensor(self, component, bid):
"""Map Mamba layer tensor names"""
mapping = {
"mixer.A_log": f"blk.{bid}.ssm_a", # No .weight suffix for ssm_a and ssm_d
"mixer.D": f"blk.{bid}.ssm_d", # No .weight suffix for ssm_a and ssm_d
"mixer.conv1d.weight": f"blk.{bid}.ssm_conv1d.weight",
"mixer.conv1d.bias": f"blk.{bid}.ssm_conv1d.bias",
"mixer.dt_bias": f"blk.{bid}.ssm_dt.bias",
"mixer.in_proj.weight": f"blk.{bid}.ssm_in.weight",
"mixer.norm.weight": f"blk.{bid}.ssm_norm.weight",
"mixer.out_proj.weight": f"blk.{bid}.ssm_out.weight",
}
return mapping.get(component, f"blk.{bid}.{component}")

def _get_attn_layers(self) -> list[int]:
# 1) explicit layer types list
lt = self.hparams.get("layer_types")
if isinstance(lt, list):
# support string or int types
attn = []
for i, t in enumerate(lt):
if isinstance(t, str) and t.lower().startswith("attn"):
attn.append(i)
elif isinstance(t, (int, np.integer)) and int(t) == 1:
attn.append(i)
return attn
# 2) indices list
if (idx := self.hparams.get("attn_layer_indices")):
return list(map(int, idx))
# 3) periodic schedule
period = self.hparams.get("attn_layer_period")
if period:
offset = int(self.hparams.get("attn_layer_offset", 0))
return [i for i in range(self.block_count) if i % int(period) == offset]
# 4) fallback: Nemotron-H 9B default or evenly spaced ~8%
if self.block_count == 56:
return [14, 21, 30, 39]
# evenly spaced n ~ max(1, round(0.08 * L))
n = max(1, round(0.08 * self.block_count))
if n >= self.block_count:
return list(range(self.block_count))
step = self.block_count / n
return sorted({int(round(k*step)) for k in range(n)} - {self.block_count})

def _map_attention_tensor(self, component, bid):
"""Map attention layer tensor names to standard llama.cpp names"""
mapping = {
"mixer.q_proj.weight": f"blk.{bid}.wq.weight",
"mixer.k_proj.weight": f"blk.{bid}.wk.weight",
"mixer.v_proj.weight": f"blk.{bid}.wv.weight",
"mixer.o_proj.weight": f"blk.{bid}.wo.weight",
}
return mapping.get(component, f"blk.{bid}.{component}")

def _map_mlp_tensor(self, component, bid):
"""Map MLP layer tensor names"""
mapping = {
"mixer.down_proj.weight": f"blk.{bid}.ffn_down.weight",
"mixer.up_proj.weight": f"blk.{bid}.ffn_up.weight",
}
return mapping.get(component, f"blk.{bid}.{component}")


@ModelBase.register("HunYuanMoEV1ForCausalLM")
class HunYuanMoEModel(TextModel):
model_arch = gguf.MODEL_ARCH.HUNYUAN_MOE
26 changes: 26 additions & 0 deletions gguf-py/gguf/constants.py
@@ -345,6 +345,7 @@ class MODEL_ARCH(IntEnum):
MAMBA = auto()
MAMBA2 = auto()
JAMBA = auto()
NEMOTRON_H = auto()
XVERSE = auto()
COMMAND_R = auto()
COHERE2 = auto()
@@ -677,6 +678,7 @@ class MODEL_TENSOR(IntEnum):
MODEL_ARCH.MAMBA: "mamba",
MODEL_ARCH.MAMBA2: "mamba2",
MODEL_ARCH.JAMBA: "jamba",
MODEL_ARCH.NEMOTRON_H: "nemotron_h",
MODEL_ARCH.XVERSE: "xverse",
MODEL_ARCH.COMMAND_R: "command-r",
MODEL_ARCH.COHERE2: "cohere2",
@@ -1893,6 +1895,30 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.FFN_DOWN_EXP,
MODEL_TENSOR.FFN_UP_EXP,
],
MODEL_ARCH.NEMOTRON_H: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ATTN_NORM,
# Mamba2 layers
MODEL_TENSOR.SSM_IN,
MODEL_TENSOR.SSM_CONV1D,
MODEL_TENSOR.SSM_X,
MODEL_TENSOR.SSM_DT,
MODEL_TENSOR.SSM_A,
MODEL_TENSOR.SSM_D,
MODEL_TENSOR.SSM_OUT,
MODEL_TENSOR.SSM_NORM,
# Attention layers
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
# MLP layers
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
MODEL_ARCH.XVERSE: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
10 changes: 8 additions & 2 deletions gguf-py/gguf/gguf_writer.py
@@ -1076,9 +1076,15 @@ def _pack_val(self, val: Any, vtype: GGUFValueType, add_vtype: bool, sub_type: G
kv_data += self._pack("Q", len(encoded_val))
kv_data += encoded_val
elif vtype == GGUFValueType.ARRAY:

# Convert numpy arrays to lists for serialization
if hasattr(val, 'tolist'):
val = val.tolist()

if not isinstance(val, Sequence):
raise ValueError("Invalid GGUF metadata array, expecting sequence")
print(f"DEBUG: Failed metadata key type: {type(val)}")
print(f"DEBUG: Failed metadata value: {val}")
print(f"DEBUG: Caller info available in stack trace")
raise ValueError(f"Invalid GGUF metadata array, expecting sequence but got {type(val)}: {val}")
Contributor Author (jwjohns) commented:
more debug, didnt mean to commit. will clean up.


if len(val) == 0:
raise ValueError("Invalid GGUF metadata array. Empty array")
30 changes: 30 additions & 0 deletions src/llama-arch.cpp
@@ -48,6 +48,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_MAMBA, "mamba" },
{ LLM_ARCH_MAMBA2, "mamba2" },
{ LLM_ARCH_JAMBA, "jamba" },
{ LLM_ARCH_NEMOTRON_H, "nemotron_h" },
{ LLM_ARCH_FALCON_H1, "falcon-h1" },
{ LLM_ARCH_XVERSE, "xverse" },
{ LLM_ARCH_COMMAND_R, "command-r" },
@@ -200,6 +201,9 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {

{ LLM_KV_CLASSIFIER_OUTPUT_LABELS, "%s.classifier.output_labels" },

// Nemotron-H specific
{ LLM_KV_LAYER_TYPES, "%s.layer_types" },
Collaborator @gabe-l-hart commented (Aug 26, 2025):
I think we can get away with not adding this new hparam. This is similar to a piece of feedback I got during #13550 (it's a looong PR, but it's in there somewhere). I had introduced a new array hparam similar to this one (mine was a bool), but @compilade pointed out that we could extract the same information by setting n_head_kv to an array value during conversion and then reading it per-layer (here). In this case, we can leverage n_ff in the same way so that the layer types are determined as:

  1. n_head_kv == 0 && n_ff == 0 => recurrent
  2. n_head_kv == 0 && n_ff > 0 => MLP
  3. n_head_kv > 0 && n_ff == 0 => attention
  4. n_head_kv > 0 && n_ff > 0 => INVALID (or maybe valid for a future architecture??)
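
A conversion-side sketch of that idea (illustrative only: the schedule and sizes below are made-up values for a toy 6-block hybrid, and the writer calls are left commented out on the assumption that add_head_count_kv and add_feed_forward_length accept per-layer sequences, as the comment describes):

# Toy 6-block hybrid: blocks 2 and 4 are attention, blocks 1, 3, 5 are MLP,
# and the remaining blocks are recurrent (Mamba2/SSM).
block_count = 6
attn_layers = {2, 4}
mlp_layers = {1, 3, 5}

n_head_kv = [8 if i in attn_layers else 0 for i in range(block_count)]
n_ff = [3072 if i in mlp_layers else 0 for i in range(block_count)]

# During conversion (assuming per-layer sequences are accepted):
#   self.gguf_writer.add_head_count_kv(n_head_kv)
#   self.gguf_writer.add_feed_forward_length(n_ff)

# Loader-side classification per block, following the rules above:
for i in range(block_count):
    if n_head_kv[i] == 0 and n_ff[i] == 0:
        kind = "recurrent (SSM)"
    elif n_head_kv[i] == 0:
        kind = "mlp"
    else:
        kind = "attention"
    print(f"block {i}: {kind}")
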


{ LLM_KV_SHORTCONV_L_CACHE, "%s.shortconv.l_cache" },

{ LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
@@ -1101,6 +1105,31 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
},
},
{
LLM_ARCH_NEMOTRON_H,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
// Mamba2 layers
{ LLM_TENSOR_SSM_IN, "blk.%d.ssm_in" },
{ LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" },
{ LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" },
{ LLM_TENSOR_SSM_A, "blk.%d.ssm_a" },
{ LLM_TENSOR_SSM_D, "blk.%d.ssm_d" },
{ LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
{ LLM_TENSOR_SSM_NORM, "blk.%d.ssm_norm" },
// Attention layers
{ LLM_TENSOR_ATTN_Q, "blk.%d.wq" },
{ LLM_TENSOR_ATTN_K, "blk.%d.wk" },
{ LLM_TENSOR_ATTN_V, "blk.%d.wv" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.wo" },
// MLP layers
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
{
LLM_ARCH_FALCON_H1,
{
@@ -2334,6 +2363,7 @@ bool llm_arch_is_recurrent(const llm_arch & arch) {
bool llm_arch_is_hybrid(const llm_arch & arch) {
switch (arch) {
case LLM_ARCH_JAMBA:
case LLM_ARCH_NEMOTRON_H:
case LLM_ARCH_FALCON_H1:
case LLM_ARCH_PLAMO2:
case LLM_ARCH_GRANITE_HYBRID:
4 changes: 4 additions & 0 deletions src/llama-arch.h
@@ -52,6 +52,7 @@ enum llm_arch {
LLM_ARCH_MAMBA,
LLM_ARCH_MAMBA2,
LLM_ARCH_JAMBA,
LLM_ARCH_NEMOTRON_H,
LLM_ARCH_FALCON_H1,
LLM_ARCH_XVERSE,
LLM_ARCH_COMMAND_R,
@@ -239,6 +240,9 @@ enum llm_kv {

LLM_KV_CLASSIFIER_OUTPUT_LABELS,

// Nemotron-H specific
LLM_KV_LAYER_TYPES,

LLM_KV_SHORTCONV_L_CACHE,

// deprecated: