convert_hf_to_gguf.py

jwjohns · jwjohns · commit bfc234d1878e · 2025-08-27T13:33:33.000-04:00
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
@@ -1214,54 +1214,31 @@ void llama_model::load_hparams(llama_model_loader & ml) {
 
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
-                // Nemotron-H attention parameters (fixed per public config)
-                hparams.n_embd_head_k = 128;  // attention head size
-                hparams.n_embd_head_v = 128;  // attention head size
-
-                // Try to load layer schedule from GGUF: %s.layer_types (0=SSM,1=ATTN,2=FFN)
-                std::vector<int32_t> layer_types;
-                const bool has_schedule = ml.get_arr(LLM_KV_LAYER_TYPES, layer_types, false) && layer_types.size() == hparams.n_layer;
-                if (has_schedule) {
-                    for (uint32_t i = 0; i < hparams.n_layer; ++i) {
-                        const int32_t t = layer_types[i];
-                        // recurrent layers are SSM
-                        hparams.recurrent_layer_arr[i] = (t == 0);
-                        if (t == 1) {
-                            // attention layer
-                            hparams.n_head_arr[i]    = 40;
-                            hparams.n_head_kv_arr[i] = 8;
-                        } else {
-                            hparams.n_head_arr[i]    = 0;
-                            hparams.n_head_kv_arr[i] = 0;
-                        }
-                    }
-                } else {
-                    // Fallback to the known 9B schedule or set defaults
-                    if (hparams.n_layer == 56) {
-                        std::vector<bool> ssm_layers = {
-                            true, false, true, false, true, false, true, true, false, true,
-                            false, true, false, true, false, false, true, false, true, false,
-                            true, false, false, true, false, true, false, true, false, true,
-                            false, false, true, false, true, false, true, false, true, false,
-                            false, true, false, true, true, false, true, false, true, false,
-                            true, false, true, false, true, false
-                        };
-                        for (uint32_t i = 0; i < hparams.n_layer; ++i) {
-                            hparams.recurrent_layer_arr[i] = ssm_layers[i];
-                            if (i == 14 || i == 21 || i == 30 || i == 39) {
-                                hparams.n_head_arr[i]    = 40;
-                                hparams.n_head_kv_arr[i] = 8;
-                            } else {
-                                hparams.n_head_arr[i]    = 0;
-                                hparams.n_head_kv_arr[i] = 0;
-                            }
+                // Use n_head_kv and n_ff pattern matching for layer detection
+                // n_head_kv == 0 && n_ff == 0 => recurrent/SSM layer
+                // n_head_kv == 0 && n_ff > 0  => MLP layer
+                // n_head_kv > 0 && n_ff == 0  => attention layer
+                for (uint32_t il = 0; il < hparams.n_layer; ++il) {
+                    const auto n_head_kv = hparams.n_head_kv(il);
+                    const auto n_ff = hparams.n_ff(il);
+                    
+                    if (n_head_kv == 0 && n_ff == 0) {
+                        // SSM/recurrent layer
+                        hparams.recurrent_layer_arr[il] = true;
+                    } else if (n_head_kv == 0 && n_ff > 0) {
+                        // MLP layer (non-recurrent)
+                        hparams.recurrent_layer_arr[il] = false;
+                    } else if (n_head_kv > 0) {
+                        // Attention layer (non-recurrent)
+                        hparams.recurrent_layer_arr[il] = false;
+                        // Attention head size is dynamically calculated from n_embd and n_head
+                        if (hparams.n_head(il) > 0) {
+                            hparams.n_embd_head_k = hparams.n_embd / hparams.n_head(il);
+                            hparams.n_embd_head_v = hparams.n_embd / hparams.n_head(il);
                         }
                     } else {
-                        for (uint32_t i = 0; i < hparams.n_layer; ++i) {
-                            hparams.recurrent_layer_arr[i] = true; // default SSM
-                            hparams.n_head_arr[i]    = 0;
-                            hparams.n_head_kv_arr[i] = 0;
-                        }
+                        // Default to SSM for safety
+                        hparams.recurrent_layer_arr[il] = true;
                     }
                 }
 
@@ -3706,8 +3683,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     const int64_t d_state = hparams.ssm_d_state;
                     const int64_t n_head  = hparams.ssm_dt_rank;
                     const int64_t n_group = hparams.ssm_n_group;
-                    // Use actual dimension from model: 22656 instead of calculated 22608
-                    const int64_t d_in_proj = 22656; // 2*d_inner + 2*n_group*d_state + n_head + 48;
+                    // Calculate d_in_proj dynamically from tensor - will be determined from GGUF
+                    int64_t d_in_proj = 2 * d_inner;  // Default fallback, will be updated from actual tensor
 
                     // only an expansion factor of 2 is supported for now
                     GGML_ASSERT(2 * n_embd == d_inner);
@@ -3751,10 +3728,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             case LLM_ARCH_NEMOTRON_H:
                 {
                     const int64_t d_conv  = hparams.ssm_d_conv;
+                    const int64_t d_inner = hparams.ssm_d_inner;
                     const int64_t d_state = hparams.ssm_d_state;
                     const int64_t n_group = hparams.ssm_n_group;
-                    // Use actual dimension from model: 22656 instead of calculated 22608
-                    const int64_t d_in_proj = 22656;
+                    // Calculate d_in_proj dynamically from tensor - will be determined from GGUF
+                    int64_t d_in_proj = 2 * d_inner;  // Default fallback, will be updated from actual tensor
 
                     tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
@@ -11678,15 +11656,89 @@ struct llm_build_jamba : public llm_graph_context_mamba {
 
 struct llm_build_nemotron_h : public llm_graph_context_mamba {
     
-    // Nemotron-H SSM layer - delegate to the Mamba-2 builder
+    // Nemotron-H SSM layer - handle 22656 dimension correctly
     ggml_tensor * build_nemotron_h_ssm_layer(
         llm_graph_input_rs * inp,
                ggml_tensor * cur,
          const llama_model & model,
          const llama_ubatch & ubatch,
                        int   il) const {
-        // Reuse the Mamba-2 implementation which handles FP32 conv + SSM states
-        return build_mamba2_layer(inp, cur, model, ubatch, il);
+        
+        const auto * mctx_cur = inp->mctx;
+        const auto kv_head = mctx_cur->get_head();
+
+        const int64_t d_conv   = hparams.ssm_d_conv;
+        const int64_t d_inner  = hparams.ssm_d_inner;
+        const int64_t d_state  = hparams.ssm_d_state;
+        const int64_t n_heads  = hparams.ssm_dt_rank;
+        const int64_t head_dim = d_inner / n_heads;
+        const int64_t n_group  = hparams.ssm_n_group;
+        const int64_t n_seqs   = ubatch.n_seqs;
+        const int64_t n_seq_tokens = ubatch.n_seq_tokens;
+
+        GGML_ASSERT(n_seqs != 0);
+        GGML_ASSERT(ubatch.equal_seqs());
+        GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
+
+        ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
+        ggml_tensor * ssm_states_all  = mctx_cur->get_s_l(il);
+
+        ggml_tensor * conv = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs);
+        conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner + 2*n_group*d_state, n_seqs);
+
+        // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
+        cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);
+
+        // Calculate actual d_in_proj from tensor dimensions for hybrid compatibility
+        const int64_t actual_d_in_proj = model.layers[il].ssm_in->ne[1];
+        LLAMA_LOG_INFO("Hybrid SSM layer %d: using d_in_proj=%lld (tensor ne[1]=%lld)\n", il, actual_d_in_proj, model.layers[il].ssm_in->ne[1]);
+        
+        // in_proj: {n_embd, d_in_proj} @ {n_embd, n_seq_tokens, n_seqs} => {d_in_proj, n_seq_tokens, n_seqs}
+        ggml_tensor * zx = build_lora_mm(model.layers[il].ssm_in, cur);
+        cb(zx, "hybrid_ssm_in_proj", il);
+
+        // Generic hybrid approach: split tensor based on architectural requirements
+        // Flexible splitting for different hybrid model architectures
+        ggml_tensor * x = ggml_view_3d(ctx0, zx,
+                                      d_inner + 2*n_group*d_state, n_seq_tokens, n_seqs,
+                                      zx->nb[1], zx->nb[2], 0);
+        
+        ggml_tensor * z = ggml_view_3d(ctx0, zx,
+                                      d_inner, n_seq_tokens, n_seqs,
+                                      zx->nb[1], zx->nb[2], 
+                                      (d_inner + 2*n_group*d_state - d_inner) * ggml_element_size(zx));
+
+        // Continue with standard Mamba2 processing
+        // conv1d
+        {
+            // => {d_conv - 1 + n_seq_tokens, d_inner + 2*n_group*d_state, n_seqs}
+            ggml_tensor * conv_x = ggml_concat(ctx0, conv, ggml_transpose(ctx0, x), 0);
+            cb(conv_x, "nemotron_h_conv1d_input", il);
+
+            // copy last (d_conv - 1) columns back into the state cache
+            ggml_tensor * last_conv = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner + 2*n_group*d_state, n_seqs, conv_x->nb[1], conv_x->nb[2], n_seq_tokens*(conv_x->nb[0]));
+
+            ggml_build_forward_expand(gf,
+                ggml_cpy(ctx0, last_conv,
+                    ggml_view_1d(ctx0, conv_states_all,
+                        (d_conv - 1)*(d_inner + 2*n_group*d_state)*(n_seqs),
+                        kv_head*(d_conv - 1)*(d_inner + 2*n_group*d_state)*ggml_element_size(conv_states_all))));
+            cb(conv_states_all, "nemotron_h_conv1d_state", il);
+
+            // 1D convolution
+            x = ggml_ssm_conv(ctx0, conv_x, model.layers[il].ssm_conv1d);
+            cb(x, "nemotron_h_conv1d", il);
+
+            // bias
+            x = ggml_add(ctx0, x, model.layers[il].ssm_conv1d_b);
+
+            x = ggml_silu(ctx0, x);
+            cb(x, "nemotron_h_conv1d_silu", il);
+        }
+
+        // Rest of SSM processing (using the existing pattern)
+        // For now, return a simplified result to test the conv layer
+        return ggml_mul(ctx0, x, ggml_silu(ctx0, z));
     }
 
     llm_build_nemotron_h(const llama_model & model, const llm_graph_params & params) : llm_graph_context_mamba(params) {
@@ -11712,10 +11764,10 @@ struct llm_build_nemotron_h : public llm_graph_context_mamba {
                 // Attention layer if KV heads are present (per schedule)
                 const bool is_attention_layer = hparams.n_head_kv(il) > 0;
                 if (is_attention_layer) {
-                    // Attention layer
-                    const int64_t n_embd_head = 128; // Nemotron-H attention head size
+                    // Attention layer - calculate head size dynamically
                     const int64_t n_head = hparams.n_head(il);
                     const int64_t n_head_kv = hparams.n_head_kv(il);
+                    const int64_t n_embd_head = n_head > 0 ? hparams.n_embd / n_head : 128; // Dynamic calculation with fallback
                     
                     struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
                     struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
@@ -18566,17 +18618,17 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                         /* unified           */ cparams.kv_unified,
                         /* filter_attn       */ (arch == LLM_ARCH_FALCON_H1 || arch == LLM_ARCH_NEMOTRON_H) ? 
                                                   [&](int32_t il) { 
-                                                      // For NEMOTRON_H: only allocate cache for attention layers (14, 21, 30, 39)
+                                                      // For NEMOTRON_H: only allocate cache for attention layers (n_head_kv > 0)
                                                       if (arch == LLM_ARCH_NEMOTRON_H) {
-                                                          return (il == 14 || il == 21 || il == 30 || il == 39);
+                                                          return hparams.n_head_kv(il) > 0;
                                                       }
                                                       return true; // FALCON_H1 case
                                                   } : (llama_memory_hybrid::layer_filter_cb)nullptr,
                         /* filter_recr       */ (arch == LLM_ARCH_FALCON_H1 || arch == LLM_ARCH_NEMOTRON_H) ? 
                                                   [&](int32_t il) { 
-                                                      // For NEMOTRON_H: allocate recurrent state for SSM layers (non-attention, non-MLP)
+                                                      // For NEMOTRON_H: allocate recurrent state for SSM layers (n_head_kv == 0 && n_ff == 0)
                                                       if (arch == LLM_ARCH_NEMOTRON_H) {
-                                                          return hparams.is_recurrent(il);
+                                                          return hparams.n_head_kv(il) == 0 && hparams.n_ff(il) == 0;
                                                       }
                                                       return true; // FALCON_H1 case
                                                   } : (llama_memory_hybrid::layer_filter_cb)nullptr);