@@ -3751,14 +3751,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             case LLM_ARCH_NEMOTRON_H:
                 {
                     const int64_t d_conv = hparams.ssm_d_conv;
-                    // Nemotron-H uses 12288 for conv1d tensors, not the standard 15680
-                    const int64_t d_inner = 12288; // Override: actual conv1d tensor dimension
                     const int64_t d_state = hparams.ssm_d_state;
-                    const int64_t n_head = hparams.ssm_dt_rank;
                     const int64_t n_group = hparams.ssm_n_group;
                     // Use actual dimension from model: 22656 instead of calculated 22608
-                    const int64_t d_in_proj = 22656; // 2*d_inner + 2*n_group*d_state + n_head + 48;
-                    const int64_t d_x_part = d_inner + 2*n_group*d_state; // x1 + B + C
+                    const int64_t d_in_proj = 22656;
 
                     tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -11688,8 +11684,7 @@ struct llm_build_nemotron_h : public llm_graph_context_mamba {
             ggml_tensor * cur,
             const llama_model & model,
             const llama_ubatch & ubatch,
-            int il,
-            ggml_cgraph * gf) const {
+            int il) const {
         // Reuse the Mamba-2 implementation which handles FP32 conv + SSM states
         return build_mamba2_layer(inp, cur, model, ubatch, il);
     }
@@ -11712,7 +11707,7 @@ struct llm_build_nemotron_h : public llm_graph_context_mamba {
         // Nemotron-H hybrid layer logic based on schedule
         if (hparams.is_recurrent(il)) {
             // SSM/Mamba layer - use Nemotron-H specific implementation
-            cur = build_nemotron_h_ssm_layer(inp_hybrid->get_recr(), cur, model, ubatch, il, gf );
+            cur = build_nemotron_h_ssm_layer(inp_hybrid->get_recr(), cur, model, ubatch, il);
         } else {
             // Attention layer if KV heads are present (per schedule)
             const bool is_attention_layer = hparams.n_head_kv(il) > 0;
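
For context, the dispatch that the last two hunks simplify boils down to a per-layer schedule check: layers flagged as recurrent delegate to the shared Mamba-2 builder, and layers with KV heads go through attention. Below is a minimal, self-contained sketch of that pattern; the names here (toy_hparams, classify_layer, the 6-layer schedule) are hypothetical stand-ins for illustration, not llama.cpp API.

// Hypothetical stand-in sketch of the hybrid layer dispatch shown in the diff:
// recurrent layers take the SSM/Mamba-2 path, layers with KV heads take attention,
// anything else is treated as an MLP-only layer.
#include <cstdio>
#include <vector>

struct toy_hparams {
    std::vector<bool> recurrent;   // stands in for hparams.is_recurrent(il)
    std::vector<int>  n_head_kv;   // stands in for hparams.n_head_kv(il)
};

enum class layer_kind { ssm, attention, mlp };

static layer_kind classify_layer(const toy_hparams & hp, int il) {
    if (hp.recurrent[il])     { return layer_kind::ssm;       }
    if (hp.n_head_kv[il] > 0) { return layer_kind::attention; }
    return layer_kind::mlp;
}

int main() {
    // toy 6-layer schedule: SSM, SSM, attention, SSM, MLP, attention
    toy_hparams hp;
    hp.recurrent = { true, true, false, true, false, false };
    hp.n_head_kv = { 0,    0,    8,     0,    0,     8     };

    for (int il = 0; il < (int) hp.recurrent.size(); ++il) {
        switch (classify_layer(hp, il)) {
            case layer_kind::ssm:       printf("layer %d: SSM (Mamba-2 builder path)\n", il); break;
            case layer_kind::attention: printf("layer %d: attention\n", il);                  break;
            case layer_kind::mlp:       printf("layer %d: MLP only\n", il);                   break;
        }
    }
    return 0;
}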