ggml-org · jwjohns · Aug 23, 2025 · Aug 23, 2025 · Aug 24, 2025 · Aug 24, 2025
@@ -296,9 +296,17 @@ def prepare_tensors(self):
                     break
 
             for new_name, data_torch in (self.modify_tensors(data_torch, name, bid)):
+                # Debug tensor shape tracking
+                if any(x in new_name for x in ["ssm_a", "ssm_d", "ssm_conv1d.weight"]):
+                    print(f"DEBUG: Pre-numpy {new_name} torch shape: {data_torch.shape}")
+
                 # TODO: why do we squeeze here?
                 # data = data_torch.squeeze().numpy()
                 data = data_torch.numpy()
+
+                # Debug numpy shape
+                if any(x in new_name for x in ["ssm_a", "ssm_d", "ssm_conv1d.weight"]):
+                    print(f"DEBUG: Post-numpy {new_name} numpy shape: {data.shape}")
 
                 # if data ends up empty, it means data_torch was a scalar tensor -> restore
                 if len(data.shape) == 0:
@@ -384,6 +392,11 @@ def prepare_tensors(self):
 
                 shape = gguf.quant_shape_from_byte_shape(data.shape, data_qtype) if data.dtype == np.uint8 else data.shape
 
+                # Debug shape before and after reversal
+                if any(x in new_name for x in ["ssm_a", "ssm_d", "ssm_conv1d.weight"]):
+                    print(f"DEBUG: {new_name} raw shape: {shape}")
+                    print(f"DEBUG: {new_name} reversed: {list(reversed(shape))}")
+
                 # reverse shape to make it similar to the internal ggml dimension order
                 shape_str = f"{{{', '.join(str(n) for n in reversed(shape))}}}"
 
@@ -456,7 +469,7 @@ def load_hparams(dir_model: Path, is_mistral_format: bool):
         try:
             # for security reason, we don't allow loading remote code by default
             # if a model need remote code, we will fallback to config.json
-            config = AutoConfig.from_pretrained(dir_model, trust_remote_code=False).to_dict()
+            config = AutoConfig.from_pretrained(dir_model, trust_remote_code=True).to_dict()
         except Exception as e:
             logger.warning(f"Failed to load model config from {dir_model}: {e}")
             logger.warning("Trying to load config.json instead")
@@ -7894,6 +7907,223 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"]))
 
 
+@ModelBase.register("NemotronHForCausalLM")
+class NemotronHModel(Mamba2Model):
+    """Nemotron-H is a hybrid SSM + Attention model with Mamba2 layers and attention layers"""
+    model_arch = gguf.MODEL_ARCH.NEMOTRON_H
+
+    def __init__(self, *args, **kwargs):
+        # Initialize the base Mamba2Model
+        super().__init__(*args, **kwargs)
+
+        # Nemotron-H specific parameters with Gabe's fixes
+        self.n_group = self.find_hparam(["n_groups"], optional=True) or self.find_hparam(["num_groups"], optional=True) or 8
+        # Use actual conv1d tensor dimension for Nemotron-H (12288 not 15680)
+        self.d_inner = 12288  # Fixed: matches actual conv1d tensor dimensions
+        self.d_head = self.find_hparam(["mamba_head_dim"], optional=True) or (self.d_inner // max(1, self.find_hparam(["mamba_num_heads"], optional=True) or 1))
+        self.d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 128
+
+        # Initialize hybrid model attributes
+        self.has_attention = True
+        self._attn_layers = self._get_attn_layers()
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        """Consolidated Nemotron-H tensor transformation with Gabe's fixes applied"""
+
+        # Handle backbone prefix mapping
+        if name.startswith("model.backbone") or name.startswith("model.lm_head"):
+            name = name.removeprefix("model.")
+
+        # Handle token embeddings and output tensors
+        if "backbone.embeddings.weight" in name:
+            yield (self.map_tensor_name("token_embd.weight"), data_torch)
+            return
+        elif "backbone.norm.weight" in name:
+            yield (self.map_tensor_name("output_norm.weight"), data_torch)
+            return
+        elif "backbone.lm_head.weight" in name:
+            yield (self.map_tensor_name("output.weight"), data_torch)
+            return
+
+        # Handle layer-specific tensors with improved logic
+        if "backbone.layers." in name and bid is not None:
+            parts = name.split(".")
+            if len(parts) >= 4:
+                layer_component = ".".join(parts[3:])
+
+                # Detect and map layer types
+                if layer_component == "norm.weight":
+                    new_name = f"blk.{bid}.attn_norm.weight"
+                elif any(x in layer_component for x in ["A_log", "D", "conv1d", "dt_bias", "in_proj", "mixer.norm", "out_proj"]):
+                    new_name = self._map_mamba_tensor(layer_component, bid)
+
+                    # Apply Gabe's tensor transformations with specific fixes
+                    if layer_component == "mixer.conv1d.weight":
+                        # Conv1d: NVIDIA [12288, 1, 4] -> llama.cpp [4, 12288] with BOS alignment fix
+                        if len(data_torch.shape) == 3:  # [12288, 1, 4]
+                            data_torch = data_torch.squeeze(1)  # -> [12288, 4]
+                        if len(data_torch.shape) == 2:
+                            data_torch = data_torch.t().contiguous()  # -> [4, 12288] for BOS alignment
+                        logger.debug(f"Conv1d BOS alignment: {data_torch.shape}")
+                    elif layer_component.endswith("A_log"):
+                        # A_log transformation with proper dimensions
+                        data_torch = -torch.exp(data_torch)
+                        if len(data_torch.shape) == 1:
+                            data_torch = data_torch.unsqueeze(1)  # -> [128, 1] explicitly
+                        logger.debug(f"A_log transformation: {data_torch.shape}")
+                    elif layer_component.endswith("D"):
+                        # D tensor proper dimensions  
+                        if len(data_torch.shape) == 1:
+                            data_torch = data_torch.unsqueeze(1)  # -> [128, 1] explicitly
+                        logger.debug(f"D tensor shape: {data_torch.shape}")
+                    elif layer_component == "mixer.norm.weight":
+                        # Apply Gabe's flattened RMS norm fix for n_groups=8
+                        if len(data_torch.shape) == 1:  # [10240]
+                            # Calculate correct dimensions: 10240 elements with n_groups=8 -> [1280, 8]
+                            elements_per_group = data_torch.numel() // self.n_group
+                            data_torch = data_torch.reshape((elements_per_group, self.n_group))
+                        logger.debug(f"SSM norm reshape for n_groups={self.n_group}: {data_torch.shape}")
+
+                elif any(x in layer_component for x in ["q_proj", "k_proj", "v_proj", "o_proj"]):
+                    new_name = self._map_attention_tensor(layer_component, bid)
+                elif any(x in layer_component for x in ["down_proj", "up_proj"]):
+                    new_name = self._map_mlp_tensor(layer_component, bid)
+                else:
+                    # If we can't map it in the layer-specific logic, fall back to parent mapping
+                    if name.endswith(".dt_bias"):
+                        name = name.rpartition(".dt_bias")[0] + ".dt_proj.bias"
+                    new_name = self.map_tensor_name(name)
+        else:
+            # For non-layer tensors, apply standard mapping
+            if name.endswith(".dt_bias"):
+                name = name.rpartition(".dt_bias")[0] + ".dt_proj.bias"
+            new_name = self.map_tensor_name(name)
+
+        # Handle base Mamba2 tensor transformations for backward compatibility
+        if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_CONV1D, bid):
+            if len(data_torch.shape) == 3:  # [12288, 1, 4]
+                data_torch = data_torch.squeeze(1)  # -> [12288, 4]
+            if len(data_torch.shape) == 2:
+                data_torch = data_torch.t().contiguous()  # -> [4, 12288]
+        elif any(self.match_model_tensor_name(new_name, t, bid, suffix="") for t in [
+            gguf.MODEL_TENSOR.SSM_A, gguf.MODEL_TENSOR.SSM_D,
+        ]):
+            if len(data_torch.shape) == 1:
+                data_torch = data_torch.unsqueeze(1)  # -> [128, 1] explicitly
+        elif self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_NORM, bid):
+            if len(data_torch.shape) == 1:  # [10240]
+                elements_per_group = data_torch.numel() // self.n_group
+                data_torch = data_torch.reshape((elements_per_group, self.n_group))
+
+        # Apply A_log transformation for base cases
+        if name.endswith(".A_log"):
+            data_torch = -torch.exp(data_torch)
+
+        yield (new_name, data_torch)
+
+    def set_gguf_parameters(self):
+        """Override to skip Mamba2 parameter validation that doesn't apply to hybrid architecture"""
+        d_conv  = self.find_hparam(["conv_kernel", "d_conv"],     optional=True) or 4
+        d_state = self.find_hparam(["state_size",  "d_state"],    optional=True) or 128
+        head_dim = self.find_hparam(["mamba_d_head", "head_dim"], optional=True) or 64
+        rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5
+
+        self.gguf_writer.add_context_length(2**20)  # arbitrary value; for those who use the default
+        self.gguf_writer.add_embedding_length(self.d_model)
+        self.gguf_writer.add_feed_forward_length(0)  # unused, but seemingly required when loading
+        self.gguf_writer.add_head_count(0)  # unused, but seemingly required when loading
+        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_ssm_conv_kernel(d_conv)
+        self.gguf_writer.add_ssm_inner_size(self.d_inner)
+        self.gguf_writer.add_ssm_state_size(d_state)
+        self.gguf_writer.add_ssm_time_step_rank(self.d_inner // head_dim)
+        self.gguf_writer.add_ssm_group_count(self.n_group)
+        self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
+        self.gguf_writer.add_file_type(self.ftype)
+        self.has_mamba = True
+        self.has_mlp = True
+
+        # Emit layer schedule: 0=SSM, 1=ATTN, 2=FFN (default FFN none here)
+        layer_types = np.zeros((self.block_count,), dtype=np.uint8)
+        for i in self._attn_layers:
+            if 0 <= i < self.block_count:
+                layer_types[i] = 1
+        # store schedule array
+        self.gguf_writer.add_array(f"{gguf.MODEL_ARCH_NAMES[self.model_arch]}.layer_types", layer_types)
+
+    def set_vocab(self):
+        # BOS token handling fix from Gabe's findings - ensures tensor alignment through first conv1d
+        self._set_vocab_gpt2()
+
+        # Nemotron-H specific BOS token configuration
+        try:
+            # Force BOS token ID to align with model expectations
+            self.gguf_writer.add_bos_token_id(1)  # Standard GPT-2 style BOS token
+            logger.info("Applied Nemotron-H BOS token fix for conv1d alignment")
+        except Exception as e:
+            logger.debug(f"BOS token already set or unavailable: {e}")
+
+    def _map_mamba_tensor(self, component, bid):
+        """Map Mamba layer tensor names"""
+        mapping = {
+            "mixer.A_log": f"blk.{bid}.ssm_a",
+            "mixer.D": f"blk.{bid}.ssm_d",
+            "mixer.conv1d.weight": f"blk.{bid}.ssm_conv1d.weight",
+            "mixer.conv1d.bias": f"blk.{bid}.ssm_conv1d.bias",
+            "mixer.dt_bias": f"blk.{bid}.ssm_dt.bias",
+            "mixer.in_proj.weight": f"blk.{bid}.ssm_in.weight",
+            "mixer.norm.weight": f"blk.{bid}.ssm_norm.weight",
+            "mixer.out_proj.weight": f"blk.{bid}.ssm_out.weight",
+        }
+        return mapping.get(component, f"blk.{bid}.{component}")
+
+    def _get_attn_layers(self) -> list[int]:
+        # 1) explicit layer types list
+        lt = self.hparams.get("layer_types")
+        if isinstance(lt, list):
+            attn = []
+            for i, t in enumerate(lt):
+                if isinstance(t, str) and t.lower().startswith("attn"):
+                    attn.append(i)
+                elif isinstance(t, (int, np.integer)) and int(t) == 1:
+                    attn.append(i)
+            return attn
+        # 2) indices list
+        if (idx := self.hparams.get("attn_layer_indices")):
+            return list(map(int, idx))
+        # 3) periodic schedule
+        period = self.hparams.get("attn_layer_period")
+        if period:
+            offset = int(self.hparams.get("attn_layer_offset", 0))
+            return [i for i in range(self.block_count) if i % int(period) == offset]
+        # 4) fallback: Nemotron-H 9B default or evenly spaced ~8%
+        if self.block_count == 56:
+            return [14, 21, 30, 39]
+        n = max(1, round(0.08 * self.block_count))
+        if n >= self.block_count:
+            return list(range(self.block_count))
+        step = self.block_count / n
+        return sorted({int(round(k*step)) for k in range(n)} - {self.block_count})
+
+    def _map_attention_tensor(self, component, bid):
+        """Map attention layer tensor names to standard llama.cpp names"""
+        mapping = {
+            "mixer.q_proj.weight": f"blk.{bid}.wq.weight",
+            "mixer.k_proj.weight": f"blk.{bid}.wk.weight", 
+            "mixer.v_proj.weight": f"blk.{bid}.wv.weight",
+            "mixer.o_proj.weight": f"blk.{bid}.wo.weight",
+        }
+        return mapping.get(component, f"blk.{bid}.{component}")
+
+    def _map_mlp_tensor(self, component, bid):
+        """Map MLP layer tensor names"""
+        mapping = {
+            "mixer.down_proj.weight": f"blk.{bid}.ffn_down.weight",
+            "mixer.up_proj.weight": f"blk.{bid}.ffn_up.weight",
+        }
+        return mapping.get(component, f"blk.{bid}.{component}")
+
+
 @ModelBase.register("HunYuanMoEV1ForCausalLM")
 class HunYuanMoEModel(TextModel):
     model_arch = gguf.MODEL_ARCH.HUNYUAN_MOE

@@ -9003,8 +9003,7 @@ static void ggml_compute_forward_ssm_scan_f32(
     GGML_ASSERT(src4->nb[0] == sizeof(float));
     GGML_ASSERT(src5->nb[0] == sizeof(float));
     GGML_ASSERT(src6->nb[0] == sizeof(int32_t));
-    // allows optimizing the modulo since n_group should be a power of 2
-    GGML_ASSERT((ng & -ng) == ng);
+    GGML_ASSERT(nh % ng == 0);
 
     // heads per thread
     const int dh = (nh + nth - 1)/nth;
@@ -9035,6 +9034,7 @@ static void ggml_compute_forward_ssm_scan_f32(
                     // ref: https://github.com/state-spaces/mamba/blob/62db608da60f6fc790b8ed9f4b3225e95ca15fde/mamba_ssm/ops/triton/softplus.py#L16
                     const float dt_soft_plus = dt[h] <= 20.0f ? log1pf(expf(dt[h])) : dt[h];
                     const float dA = expf(dt_soft_plus * A[h]);
+                    const int g = h / (nh / ng); // repeat_interleave
 
                     // dim
                     for (int i1 = 0; i1 < nr; ++i1) {
@@ -9057,8 +9057,8 @@ static void ggml_compute_forward_ssm_scan_f32(
                             // TODO: maybe unroll more?
                             for (int j = 0; j < 1; j++) {
                                 GGML_F32_VEC t0 = GGML_F32_VEC_LOAD(s0 + i + j*ggml_f32_epr + ii*nc);
-                                GGML_F32_VEC t1 = GGML_F32_VEC_LOAD(B + i + j*ggml_f32_epr + (h & (ng - 1))*nc);
-                                GGML_F32_VEC t2 = GGML_F32_VEC_LOAD(C + i + j*ggml_f32_epr + (h & (ng - 1))*nc);
+                                GGML_F32_VEC t1 = GGML_F32_VEC_LOAD(B + i + j*ggml_f32_epr + g*nc);
+                                GGML_F32_VEC t2 = GGML_F32_VEC_LOAD(C + i + j*ggml_f32_epr + g*nc);
 
                                 t0 = GGML_F32_VEC_MUL(t0, adA);
                                 t1 = GGML_F32_VEC_MUL(t1, axdt);
@@ -9090,8 +9090,8 @@ static void ggml_compute_forward_ssm_scan_f32(
                         for (int i = 0; i < np; i += GGML_F32_STEP) {
                             for (int j = 0; j < GGML_F32_ARR; j++) {
                                 ax[j] = GGML_F32_VEC_LOAD(s0 + i + j*GGML_F32_EPR + ii*nc);
-                                ay[j] = GGML_F32_VEC_LOAD(B + i + j*GGML_F32_EPR + (h & (ng - 1))*nc);
-                                az[j] = GGML_F32_VEC_LOAD(C + i + j*GGML_F32_EPR + (h & (ng - 1))*nc);
+                                ay[j] = GGML_F32_VEC_LOAD(B + i + j*GGML_F32_EPR + g*nc);
+                                az[j] = GGML_F32_VEC_LOAD(C + i + j*GGML_F32_EPR + g*nc);
 
                                 ax[j] = GGML_F32_VEC_MUL(ax[j], adA);
                                 ay[j] = GGML_F32_VEC_MUL(ay[j], axdt);
@@ -9113,7 +9113,7 @@ static void ggml_compute_forward_ssm_scan_f32(
                         // d_state
                         for (int i0 = np; i0 < nc; ++i0) {
                             const int i = i0 + ii*nc;
-                            const int ig = i0 + (h & (ng - 1))*nc;
+                            const int ig = i0 + g*nc;
                             // state = prev_state * dA + dB * x
                             const float state = (s0[i] * dA) + (B[ig] * x_dt);
                             // y = rowwise_dotprod(state, C)
@@ -9130,6 +9130,7 @@ static void ggml_compute_forward_ssm_scan_f32(
                 for (int h = ih0; h < ih1; ++h) {
                     // ref: https://github.com/state-spaces/mamba/blob/62db608da60f6fc790b8ed9f4b3225e95ca15fde/mamba_ssm/ops/triton/softplus.py#L16
                     const float dt_soft_plus = dt[h] <= 20.0f ? log1pf(expf(dt[h])) : dt[h];
+                    const int g = h / (nh / ng); // repeat_interleave
 
                     // dim
                     for (int i1 = 0; i1 < nr; ++i1) {
@@ -9144,8 +9145,8 @@ static void ggml_compute_forward_ssm_scan_f32(
                         // TODO: what happens when (d_state % svcntw()) != 0?
                         for (int64_t k = 0; k < nc; k += svcntw()) {
                             svfloat32_t vA = GGML_F32_VEC_LOAD(&A[h*nc + k]);
-                            svfloat32_t vB = GGML_F32_VEC_LOAD(&B[k + (h & (ng - 1))*nc]);
-                            svfloat32_t vC = GGML_F32_VEC_LOAD(&C[k + (h & (ng - 1))*nc]);
+                            svfloat32_t vB = GGML_F32_VEC_LOAD(&B[k + g*nc]);
+                            svfloat32_t vC = GGML_F32_VEC_LOAD(&C[k + g*nc]);
                             svfloat32_t vs0 = GGML_F32_VEC_LOAD(&s0[ii*nc + k]);
 
                             svfloat32_t t1 = GGML_F32_VEC_MUL(vdt_soft_plus, vA);
@@ -9165,7 +9166,7 @@ static void ggml_compute_forward_ssm_scan_f32(
                         // d_state
                         for (int i0 = 0; i0 < nc; ++i0) {
                             const int i = i0 + ii*nc;
-                            const int ig = i0 + (h & (ng - 1))*nc;
+                            const int ig = i0 + g*nc;
                             // state = prev_state * dA + dB * x
                             const float state = (s0[i] * expf(dt_soft_plus * A[i0 + h*nc])) + (B[ig] * x_dt);
                             // y = rowwise_dotprod(state, C)