Commit cc9b929

committed: still isn't working, though progress is being made
1 parent 62accf9 commit cc9b929

File tree: 2 files changed (+20, -10 lines)


convert_hf_to_gguf.py

Lines changed: 12 additions & 6 deletions
@@ -7982,15 +7982,21 @@ def modify_tensors(self, data_torch, name, bid):
         # Special handling for conv1d: reshape from 3D to 2D
         if "conv1d.weight" in layer_component and len(data_torch.shape) == 3:
             data_torch = data_torch.squeeze(1)  # Remove middle dimension: {4,1,12288} -> {4,12288}
-        # A_log -> A = -exp(A_log) and reshape from [128,1,1,1] to [1,128]
+        # A_log -> A = -exp(A_log) and ensure [1,128] shape for llama.cpp
         if layer_component.endswith("A_log"):
             data_torch = -torch.exp(data_torch)
-            if len(data_torch.shape) == 4 and data_torch.shape[1:] == (1, 1, 1):
-                data_torch = data_torch.reshape(1, data_torch.shape[0])  # [128,1,1,1] -> [1,128]
-        # D tensor also needs reshaping from [128,1,1,1] to [1,128]
+            # Ensure 2D shape [1, d_state] for llama.cpp compatibility
+            if len(data_torch.shape) == 1:
+                data_torch = data_torch.unsqueeze(-1)  # [128] -> [128,1] -> store as [1,128] in GGUF
+            elif len(data_torch.shape) == 4 and data_torch.shape[1:] == (1, 1, 1):
+                data_torch = data_torch.reshape(data_torch.shape[0], 1)  # [128,1,1,1] -> [128,1]
+        # D tensor also needs reshaping to [1,128] for llama.cpp
         if layer_component.endswith("D"):
-            if len(data_torch.shape) == 4 and data_torch.shape[1:] == (1, 1, 1):
-                data_torch = data_torch.reshape(1, data_torch.shape[0])  # [128,1,1,1] -> [1,128]
+            # Ensure 2D shape [1, d_state] for llama.cpp compatibility
+            if len(data_torch.shape) == 1:
+                data_torch = data_torch.unsqueeze(-1)  # [128] -> [128,1] -> store as [1,128] in GGUF
+            elif len(data_torch.shape) == 4 and data_torch.shape[1:] == (1, 1, 1):
+                data_torch = data_torch.reshape(data_torch.shape[0], 1)  # [128,1,1,1] -> [128,1]
         # Grouped RMSNorm reshape to [actual_size/n_group, n_group]
         if layer_component == "mixer.norm.weight":
             actual_size = data_torch.numel()
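
For reference, a minimal standalone sketch of the shape normalization applied to A_log and D above, using dummy PyTorch tensors; it is not the converter itself. The [128] and [128,1,1,1] input shapes and d_state = 128 are assumptions taken from the diff comments, and the note that a [128,1] torch tensor lands as [1,128] in the GGUF file relies on GGUF's reversed dimension order.

import torch

def normalize_ssm_scalar_tensor(t: torch.Tensor) -> torch.Tensor:
    # Coerce A/D tensors to 2D [d_state, 1] so they are stored as [1, d_state] in GGUF
    if len(t.shape) == 1:
        t = t.unsqueeze(-1)                          # [128] -> [128, 1]
    elif len(t.shape) == 4 and t.shape[1:] == (1, 1, 1):
        t = t.reshape(t.shape[0], 1)                 # [128, 1, 1, 1] -> [128, 1]
    return t

a_log_flat = torch.randn(128)                        # hypothetical checkpoint variant: flat [128]
a_log_4d   = torch.randn(128, 1, 1, 1)               # hypothetical checkpoint variant: [128, 1, 1, 1]

assert normalize_ssm_scalar_tensor(-torch.exp(a_log_flat)).shape == (128, 1)
assert normalize_ssm_scalar_tensor(-torch.exp(a_log_4d)).shape == (128, 1)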

src/llama-model.cpp

Lines changed: 8 additions & 4 deletions
@@ -3798,10 +3798,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, d_state}, 0);
                     layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, d_state}, 0);

-                    // grouped RMSNorm for the SSM inner stream
-                    layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);
-                    // out_proj back to model dim
-                    layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
+                    // grouped RMSNorm for the SSM inner stream (actual tensor size is 10240 not d_inner)
+                    // Nemotron-H norm tensor: 10240 elements reshaped to [1280, 8]
+                    const int64_t norm_elements_per_group = 1280; // 10240 / 8
+                    layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {norm_elements_per_group, n_group}, 0);
+                    // out_proj back to model dim (actual tensor is [4480, 10240] not [15680, 4480])
+                    // Nemotron-H out_proj: 10240 -> 4480 (not d_inner -> n_embd)
+                    const int64_t out_proj_input_dim = 10240; // Actual SSM output dim
+                    layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {out_proj_input_dim, n_embd}, 0);
                 } else if (is_attention_layer) {
                     // Attention layer tensors - compute from heads and head dim
                     const int64_t n_head_i = 40; // q heads
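
As a sanity check on the hard-coded constants above, here is a small arithmetic sketch of the Nemotron-H SSM dimensions. The concrete numbers (n_embd = 4480, n_group = 8, d_inner = 15680, SSM output width 10240) are read off the comments in this commit and should be treated as the author's working assumptions rather than verified hyperparameters.

# Dimension bookkeeping behind the hard-coded constants (values assumed from the diff comments)
n_embd      = 4480     # model dim, per the out_proj comment
n_group     = 8        # number of SSM groups
d_inner     = 15680    # dimension the old code used for ssm_norm / ssm_out
ssm_out_dim = 10240    # SSM output width actually observed in the checkpoint

old_norm_shape = (d_inner // n_group, n_group)       # (1960, 8) -> 15680 elements, too large
new_norm_shape = (ssm_out_dim // n_group, n_group)   # (1280, 8) -> 10240 elements, matches
assert new_norm_shape == (1280, 8)
assert new_norm_shape[0] * new_norm_shape[1] == ssm_out_dim

old_out_shape = (d_inner, n_embd)                    # {15680, 4480} as previously declared
new_out_shape = (ssm_out_dim, n_embd)                # {10240, 4480} matching the checkpoint tensor
assert new_out_shape == (10240, 4480)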
