Commit 98c0e98

Model conversion seems to have succeeded, but the tokenizer is not working

1 parent 694b963 commit 98c0e98
File tree

3 files changed (+117, −6 lines)

convert_hf_to_gguf.py

Lines changed: 100 additions & 6 deletions
@@ -2292,6 +2292,21 @@ def set_gguf_parameters(self):
         # Store which layers use full attention vs sliding window
         # This may need custom handling in llama.cpp
         pass
+
+        # Set layer block types for PLaMo2 hybrid architecture
+        # PLaMo2 alternates between mamba and attention layers
+        mamba_step = hparams.get("mamba_step", 2)
+        num_layers = hparams.get("num_hidden_layers", 32)
+
+        layer_types = []
+        for i in range(num_layers):
+            # Based on PLaMo2 architecture: even layers are mamba, odd layers are attention
+            if i % mamba_step == 0:
+                layer_types.append("mamba")
+            else:
+                layer_types.append("attention")
+
+        self.gguf_writer.add_array("plamo2.layers_block_type", layer_types)
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         # Handle Plamo2 specific tensor naming
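Note (illustration, not part of the commit): with the default mamba_step of 2, the loop above yields a strictly alternating layer list; a shortened, hypothetical configuration:

    # Illustration only: what the layer_types loop produces for a small, hypothetical
    # configuration (mamba_step = 2, eight layers instead of PLaMo2's default 32).
    mamba_step = 2
    num_layers = 8

    layer_types = ["mamba" if i % mamba_step == 0 else "attention" for i in range(num_layers)]
    print(layer_types)
    # ['mamba', 'attention', 'mamba', 'attention', 'mamba', 'attention', 'mamba', 'attention']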
@@ -2327,22 +2342,101 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 
         # Handle Mamba-specific A_log tensor transformation
         if name.endswith(".A_log"):
-            # Map the tensor name first
+            # Map the A_log tensor directly to ssm_a
             new_name = self.map_tensor_name(name)
-            logger.debug(f"A_log --> A ==> {new_name}")
+            # Add .weight suffix if not present
+            if not new_name.endswith(".weight"):
+                new_name += ".weight"
+            logger.debug(f"A_log --> A ==> {new_name}, original shape: {data_torch.shape}")
+
             # Transform A_log to A: A = -exp(A_log)
             data_torch = -torch.exp(data_torch)
+
+            # PLaMo2 A_log is shape {d_state} but llama.cpp expects {d_state, d_inner}
+            # Expand the tensor to the correct shape
+            if len(data_torch.shape) == 1:
+                d_state = data_torch.shape[0]  # 64
+                d_inner = 8192  # SSM inner size for PLaMo2
+
+                # Create tensor with correct shape {d_state, d_inner} = {64, 8192}
+                # Each row of the matrix should contain the same value from the original 1D tensor
+                new_tensor = data_torch.new_zeros((d_state, d_inner))
+                for i in range(d_state):
+                    new_tensor[i, :] = data_torch[i]  # Broadcast the single value across the inner dimension
+                data_torch = new_tensor
+                logger.debug(f"Expanded A tensor from {d_state} to shape: {data_torch.shape}")
+
+            return [(new_name, data_torch)]
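Note (sketch, not from the commit): with a dummy tensor, the row-wise expansion loop above is equivalent to broadcasting the 1D -exp(A_log) values across the inner dimension; sizes are the ones hard-coded in the converter:

    # Sketch with a dummy tensor: the per-row loop is equivalent to a single broadcast.
    import torch

    d_state, d_inner = 64, 8192
    a_log = torch.randn(d_state)          # stand-in for the real A_log weights
    a = -torch.exp(a_log)                 # A = -exp(A_log)

    a_expanded = a.unsqueeze(1).expand(d_state, d_inner).contiguous()
    assert a_expanded.shape == (d_state, d_inner)
    assert torch.equal(a_expanded[:, 0], a)       # every column is a copy of A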
+
+        # Handle Mamba D tensor - ensure .weight suffix
+        if name.endswith("mixer.D") or name.endswith("ssm.D"):
+            new_name = self.map_tensor_name(name)
+            # Add .weight suffix if not present
+            if not new_name.endswith(".weight"):
+                new_name += ".weight"
+            logger.debug(f"D tensor ==> {new_name}")
             return [(new_name, data_torch)]
 
         # Handle Mamba conv1d tensor shape adjustment
-        if "mixer.conv1d" in name:
+        if "mixer.conv1d" in name or ".ssm.conv1d" in name:
             new_name = self.map_tensor_name(name)
-            # Squeeze the conv1d tensor if needed
-            if len(data_torch.shape) == 4:
+            # For PLaMo2 conv1d tensors, reshape from (kernel_size, 1, d_inner) to (d_inner, kernel_size)
+            if len(data_torch.shape) == 3 and data_torch.shape[1] == 1:
+                # PLaMo2 conv1d is (kernel_size, 1, d_inner), needs to be (d_inner, kernel_size)
+                data_torch = data_torch.squeeze(1).transpose(0, 1)
+            elif len(data_torch.shape) == 4:
+                # For other formats, squeeze and transpose as needed
                 data_torch = data_torch.squeeze()
+                if len(data_torch.shape) == 2:
+                    # If it ends up as (kernel_size, d_inner), transpose to (d_inner, kernel_size)
+                    if data_torch.shape[0] < data_torch.shape[1]:
+                        data_torch = data_torch.transpose(0, 1)
             return [(new_name, data_torch)]
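Note (sketch with illustrative sizes: the kernel size of 4 is an assumption, d_inner of 8192 matches the value used elsewhere in this commit): how the 3-D conv1d branch reshapes (kernel_size, 1, d_inner) into (d_inner, kernel_size):

    # Shape sketch for the (kernel_size, 1, d_inner) branch; values are illustrative.
    import torch

    kernel_size, d_inner = 4, 8192
    w = torch.randn(kernel_size, 1, d_inner)

    w2 = w.squeeze(1).transpose(0, 1)     # (kernel_size, d_inner) -> (d_inner, kernel_size)
    assert w2.shape == (d_inner, kernel_size)
    assert torch.equal(w2[:, 0], w[0, 0, :])      # kernel tap 0 becomes column 0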
 
-        return super().modify_tensors(data_torch, name, bid)
+        # Handle Mamba ssm_dt tensor shape adjustment
+        if "mixer.dt_proj" in name:
+            new_name = self.map_tensor_name(name)
+            logger.debug(f"Processing dt_proj tensor: {name} -> {new_name}, original shape: {data_torch.shape}")
+
+            # For PLaMo2 dt_proj tensors, original shape is (64, 256) but llama.cpp expects (256, 8192)
+            # The GGUF writer seems to transpose tensors, so we need to account for that.
+            # We want the final result to be (256, 8192) after GGUF transposition
+
+            # First transpose from (64, 256) to (256, 64)
+            if len(data_torch.shape) == 2 and data_torch.shape[0] == 64 and data_torch.shape[1] == 256:
+                data_torch = data_torch.transpose(0, 1)  # Now (256, 64)
+                logger.debug(f"Transposed dt_proj to shape: {data_torch.shape}")
+
+            # Expand the second dimension from 64 to 8192 (ssm_inner_size)
+            if len(data_torch.shape) == 2 and data_torch.shape[1] == 64:
+                # ssm_inner_size should be 8192 for PLaMo2 (64 heads * 128 dim_per_head)
+                expected_inner_size = 8192
+                repeat_factor = expected_inner_size // data_torch.shape[1]
+                data_torch = data_torch.repeat(1, repeat_factor)
+                logger.debug(f"Expanded dt_proj to shape: {data_torch.shape}")
+
+            # Since GGUF writer might transpose, we need to ensure we get (256, 8192) in the end
+            # If we currently have (256, 8192) and GGUF transposes to (8192, 256),
+            # we need to pre-transpose to (8192, 256) so GGUF ends up with (256, 8192)
+            if len(data_torch.shape) == 2 and data_torch.shape == torch.Size([256, 8192]):
+                data_torch = data_torch.transpose(0, 1)  # Pre-transpose to (8192, 256)
+                logger.debug(f"Pre-transposed dt_proj for GGUF writer: {data_torch.shape}")
+
+            return [(new_name, data_torch)]
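Note (sketch, not from the commit): chained together, the three conditionals walk a dummy dt_proj tensor through (64, 256) → (256, 64) → (256, 8192) → (8192, 256); repeat tiles the 64 columns 128 times:

    # Shape walk-through for dt_proj using a dummy tensor; sizes as hard-coded above.
    import torch

    dt = torch.randn(64, 256)                    # original dt_proj shape
    dt = dt.transpose(0, 1)                      # (256, 64)
    dt = dt.repeat(1, 8192 // dt.shape[1])       # tile columns 128x -> (256, 8192)
    dt = dt.transpose(0, 1)                      # pre-transpose for the GGUF writer -> (8192, 256)

    assert dt.shape == (8192, 256)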
+
+        # Fix tensor name mappings for PLaMo2 to match llama.cpp expectations
+        result = super().modify_tensors(data_torch, name, bid)
+        fixed_result = []
+        for tensor_name, tensor_data in result:
+            # Map PLaMo2-specific norm tensor names to match llama.cpp expectations
+            if ".attn_norm_2.weight" in tensor_name:
+                tensor_name = tensor_name.replace(".attn_norm_2.weight", ".post_attn_norm.weight")
+            elif ".post_ffw_norm.weight" in tensor_name:
+                tensor_name = tensor_name.replace(".post_ffw_norm.weight", ".post_mlp_norm.weight")
+
+            fixed_result.append((tensor_name, tensor_data))
+
+        return fixed_result
 
 
 @ModelBase.register("DeciLMForCausalLM")
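Note (hypothetical input name and block index): what the final renaming pass does to a single tensor name returned by the base class:

    # Hypothetical example of the rename pass applied to one tensor name.
    name = "blk.0.attn_norm_2.weight"
    if ".attn_norm_2.weight" in name:
        name = name.replace(".attn_norm_2.weight", ".post_attn_norm.weight")
    elif ".post_ffw_norm.weight" in name:
        name = name.replace(".post_ffw_norm.weight", ".post_mlp_norm.weight")

    print(name)   # blk.0.post_attn_norm.weight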

src/llama-arch.cpp

Lines changed: 13 additions & 0 deletions
@@ -767,6 +767,12 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_SSM_A,          "blk.%d.ssm_a" },
             { LLM_TENSOR_SSM_D,          "blk.%d.ssm_d" },
             { LLM_TENSOR_SSM_OUT,        "blk.%d.ssm_out" },
+
+            // PLaMo2-specific SSM norm weights and biases
+            { LLM_TENSOR_SSM_B_NORM,     "blk.%d.ssm_b_norm_weight" },
+            { LLM_TENSOR_SSM_C_NORM,     "blk.%d.ssm_c_norm_weight" },
+            { LLM_TENSOR_SSM_DT_NORM,    "blk.%d.ssm_dt_norm_weight" },
+            { LLM_TENSOR_SSM_DT_BIAS,    "blk.%d.ssm_dt_bias" },
         },
     },
     {
@@ -1650,6 +1656,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_FFN_GATE_INP,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_SSM_IN,                     {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_SSM_X,                      {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_SSM_BCDT,                   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_SSM_DT,                     {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_SSM_OUT,                    {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_TIME_MIX_W1,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
@@ -1674,6 +1681,10 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_SSM_CONV1D,                 {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_CONV}},
     {LLM_TENSOR_SSM_A,                      {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_SCAN}},
     {LLM_TENSOR_SSM_D,                      {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_SSM_B_NORM,                 {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_SSM_C_NORM,                 {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_SSM_DT_NORM,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_SSM_DT_BIAS,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
     {LLM_TENSOR_TIME_MIX_LERP_X,            {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     {LLM_TENSOR_TIME_MIX_LN,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     {LLM_TENSOR_CHANNEL_MIX_LERP_K,         {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
@@ -1696,8 +1707,10 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_ATTN_NORM_2,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     {LLM_TENSOR_ATTN_OUT_NORM,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     {LLM_TENSOR_ATTN_POST_NORM,             {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_POST_ATTN_NORM,             {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     {LLM_TENSOR_FFN_NORM,                   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     {LLM_TENSOR_FFN_POST_NORM,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_POST_MLP_NORM,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     {LLM_TENSOR_FFN_NORM_EXPS,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     {LLM_TENSOR_ATTN_Q_NORM,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     {LLM_TENSOR_ATTN_K_NORM,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
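Note (illustration only, with an arbitrary block index): the new entries are C-style format strings, so the per-layer tensor name is obtained by substituting the block index:

    # Sketch: expanding two of the new per-layer name patterns for block index 0.
    for pattern in ("blk.%d.ssm_dt_bias", "blk.%d.ssm_b_norm_weight"):
        print(pattern % 0)
    # blk.0.ssm_dt_bias
    # blk.0.ssm_b_norm_weight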

src/llama-arch.h

Lines changed: 4 additions & 0 deletions
@@ -273,6 +273,10 @@ enum llm_tensor {
     LLM_TENSOR_SSM_A,
     LLM_TENSOR_SSM_D,
     LLM_TENSOR_SSM_OUT,
+    LLM_TENSOR_SSM_B_NORM,
+    LLM_TENSOR_SSM_C_NORM,
+    LLM_TENSOR_SSM_DT_NORM,
+    LLM_TENSOR_SSM_DT_BIAS,
     LLM_TENSOR_TIME_MIX_W0,
     LLM_TENSOR_TIME_MIX_W1,
     LLM_TENSOR_TIME_MIX_W2,
