
Commit 694b963

Trying to fix tokenizer
1 parent 8bea4f8 commit 694b963

7 files changed: +301 additions, -36 deletions

convert_hf_to_gguf.py

Lines changed: 75 additions & 23 deletions
@@ -1551,7 +1551,7 @@ def set_vocab(self):
         self.gguf_writer.add_token_list(tokens)
         self.gguf_writer.add_token_types(toktypes)
 
-        special_vocab = gguf.SpecialVocab(dir_model, n_vocab=len(tokens))
+        special_vocab = gguf.SpecialVocab(dir_model, n_vocab=len(tokens), load_merges=True)
         special_vocab.add_to_gguf(self.gguf_writer)
 
     def set_gguf_parameters(self):
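Note on the load_merges change above: gguf.SpecialVocab comes from llama.cpp's gguf-py package, and load_merges=True asks it to also read the BPE merge list from the checkpoint's tokenizer files so that add_to_gguf() writes the merges alongside the special token IDs. A minimal usage sketch, with placeholder paths and an assumed vocab size:

import gguf
from pathlib import Path

dir_model = Path("/path/to/hf-checkpoint")       # placeholder checkpoint directory
writer = gguf.GGUFWriter("model.gguf", "llama")  # placeholder output file and arch name

# load_merges=True loads merge rules (tokenizer.json / merges.txt) in addition to
# the special token IDs; add_to_gguf() writes both into the GGUF metadata
special_vocab = gguf.SpecialVocab(dir_model, n_vocab=32000, load_merges=True)
special_vocab.add_to_gguf(writer)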
@@ -2200,41 +2200,68 @@ def set_vocab(self):
 
         from transformers import AutoTokenizer
         tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
-        vocab_size = len(tokenizer.vocab)
-        # Since we are checking the maximum index, we need to ensure it's strictly less than vocab_size,
-        # because vocab_size is the count of items, and indexes start at 0.
+
+        # PLaMo2 has padded vocabulary - get actual size from embedding weight
+        # Load the embedding tensor to get the real vocab size
+        import torch
+        from safetensors import safe_open
+        actual_vocab_size = None
+
+        # Check the model weight files to get actual vocab size
+        weight_map_file = dir_model / "model.safetensors.index.json"
+        if weight_map_file.exists():
+            import json
+            with open(weight_map_file, 'r') as f:
+                weight_map = json.load(f)
+            embed_file = weight_map['weight_map']['model.embed_tokens.weight']
+            embed_path = dir_model / embed_file
+
+            with safe_open(str(embed_path), framework='pt', device='cpu') as f:
+                embed_weight = f.get_tensor('model.embed_tokens.weight')
+                actual_vocab_size = embed_weight.shape[0]
+
+        vocab_size = actual_vocab_size if actual_vocab_size else len(tokenizer.vocab)
+
+        # Since we are checking the maximum index, we need to ensure it's strictly less than tokenizer vocab size,
+        # because PLaMo2 has padded vocabulary
         max_vocab_index = max(tokenizer.get_vocab().values())
-        if max_vocab_index >= vocab_size:
+        if max_vocab_index >= len(tokenizer.vocab):
             raise ValueError("Vocabulary size exceeds expected maximum size.")
 
         reverse_vocab: dict[int, str] = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
         added_vocab = tokenizer.get_added_vocab()
 
         for token_id in range(vocab_size):
-            token_text = reverse_vocab[token_id].encode('utf-8')
-            # replace "\x00" to string with length > 0
-            if token_text == b"\x00":
-                toktype = gguf.TokenType.BYTE  # special
-                token_text = f"<{token_text}>".encode('utf-8')
-            elif re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text):
-                toktype = gguf.TokenType.BYTE  # special
-            elif reverse_vocab[token_id] in added_vocab:
-                if tokenizer.added_tokens_decoder[token_id].special:
-                    toktype = gguf.TokenType.CONTROL
-                else:
-                    toktype = gguf.TokenType.USER_DEFINED
+            # Handle padding tokens for vocab entries beyond tokenizer vocabulary
+            if token_id >= len(tokenizer.vocab):
+                # Create padding tokens for the extra vocabulary entries
+                token_text = f"<pad_{token_id}>".encode('utf-8')
+                toktype = gguf.TokenType.UNUSED
             else:
-                toktype = gguf.TokenType.NORMAL
+                token_text = reverse_vocab[token_id].encode('utf-8')
+                # replace "\x00" to string with length > 0
+                if token_text == b"\x00":
+                    toktype = gguf.TokenType.BYTE  # special
+                    token_text = f"<{token_text}>".encode('utf-8')
+                elif re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text):
+                    toktype = gguf.TokenType.BYTE  # special
+                elif reverse_vocab[token_id] in added_vocab:
+                    if tokenizer.added_tokens_decoder[token_id].special:
+                        toktype = gguf.TokenType.CONTROL
+                    else:
+                        toktype = gguf.TokenType.USER_DEFINED
+                else:
+                    toktype = gguf.TokenType.NORMAL
 
             tokens.append(token_text)
             toktypes.append(toktype)
 
-        # self.gguf_writer.add_tokenizer_model("llama")
-        # self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_tokenizer_model("llama")
+        self.gguf_writer.add_tokenizer_pre("default")
         self.gguf_writer.add_token_list(tokens)
         self.gguf_writer.add_token_types(toktypes)
 
-        special_vocab = gguf.SpecialVocab(dir_model, n_vocab=len(tokens))
+        special_vocab = gguf.SpecialVocab(dir_model, n_vocab=len(tokens), load_merges=False)
         special_vocab.add_to_gguf(self.gguf_writer)
 
     def set_gguf_parameters(self):
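The key trick in the new set_vocab above can be isolated: the padded vocabulary size is the row count of model.embed_tokens.weight, read from the safetensors shard listed in the index file, with the tokenizer size as a fallback. A standalone sketch of that check (the index file and tensor names are taken from the diff; the helper itself is illustrative):

import json
from pathlib import Path
from safetensors import safe_open

def padded_vocab_size(dir_model: Path, fallback: int) -> int:
    # Sharded checkpoints map each tensor to its shard in model.safetensors.index.json
    index_file = dir_model / "model.safetensors.index.json"
    if not index_file.exists():
        return fallback
    weight_map = json.loads(index_file.read_text())["weight_map"]
    shard = dir_model / weight_map["model.embed_tokens.weight"]
    with safe_open(str(shard), framework="pt", device="cpu") as f:
        # Row count of the embedding matrix equals the padded vocabulary size
        return f.get_tensor("model.embed_tokens.weight").shape[0]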
@@ -2245,9 +2272,16 @@ def set_gguf_parameters(self):
         # Mamba parameters
         if hparams.get("mamba_enabled", False):
             self.gguf_writer.add_ssm_conv_kernel(hparams.get("mamba_d_conv", 4))
-            self.gguf_writer.add_ssm_inner_size(hparams.get("mamba_d_state", 64) * hparams.get("intermediate_size", 13312) // hparams.get("hidden_size", 4096))
+            # PLaMo2 SSM inner size = mamba_num_heads * hidden_size_per_head
+            mamba_num_heads = hparams.get("mamba_num_heads", 64)
+            hidden_size_per_head = hparams.get("hidden_size_per_head", 128)
+            ssm_inner_size = mamba_num_heads * hidden_size_per_head
+            self.gguf_writer.add_ssm_inner_size(ssm_inner_size)
             self.gguf_writer.add_ssm_state_size(hparams.get("mamba_d_state", 64))
-            self.gguf_writer.add_ssm_time_step_rank(hparams.get("mamba_d_state", 64) // 16)  # Commonly d_state/16
+            # PLaMo2 dt_dim = max(64, hidden_size // 16)
+            hidden_size = hparams.get("hidden_size", 4096)
+            dt_dim = max(64, hidden_size // 16)
+            self.gguf_writer.add_ssm_time_step_rank(dt_dim)
 
         # Attention window parameters
         if "attention_window_size" in hparams:
@@ -2273,6 +2307,24 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
             # Reconstruct the name without the duplicate "layers"
             name = f"model.layers.{layer_num}.{rest}"
 
+        # Handle combined gate_up_proj tensor split
+        if name.endswith(".mlp.gate_up_proj.weight"):
+            # Split the combined gate_up tensor into separate gate and up tensors
+            # The tensor shape is (2 * intermediate_size, hidden_size)
+            # Split along dim 0 to get gate (first half) and up (second half)
+            intermediate_size = data_torch.shape[0] // 2
+            gate_weight = data_torch[:intermediate_size, :]
+            up_weight = data_torch[intermediate_size:, :]
+
+            # Map to the correct names
+            gate_name = self.map_tensor_name(name.replace("gate_up_proj", "gate_proj"))
+            up_name = self.map_tensor_name(name.replace("gate_up_proj", "up_proj"))
+
+            return [
+                (gate_name, gate_weight),
+                (up_name, up_weight)
+            ]
+
         # Handle Mamba-specific A_log tensor transformation
         if name.endswith(".A_log"):
             # Map the tensor name first
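The split above assumes the fused weight stacks gate_proj on top of up_proj along dim 0, i.e. a (2 * intermediate_size, hidden_size) matrix whose first half is the gate. A toy check of that slicing with made-up dimensions:

import torch

d_model, d_ff = 8, 16
gate_up = torch.randn(2 * d_ff, d_model)  # fused (2 * intermediate_size, hidden_size)

half = gate_up.shape[0] // 2
gate_w = gate_up[:half, :]  # first half  -> gate_proj
up_w = gate_up[half:, :]    # second half -> up_proj

assert gate_w.shape == (d_ff, d_model) and up_w.shape == (d_ff, d_model)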

src/llama-arch.cpp

Lines changed: 3 additions & 5 deletions
@@ -756,15 +756,13 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
         // Attention-specific tensors
         { LLM_TENSOR_ROPE_FREQS,     "rope_freqs" },
         { LLM_TENSOR_ATTN_ROT_EMBD,  "blk.%d.attn_rot_embd" },
-        { LLM_TENSOR_ATTN_Q,         "blk.%d.attn_q" },
-        { LLM_TENSOR_ATTN_K,         "blk.%d.attn_k" },
-        { LLM_TENSOR_ATTN_V,         "blk.%d.attn_v" },
-        { LLM_TENSOR_ATTN_OUT,       "blk.%d.attn_output" },
+        { LLM_TENSOR_ATTN_QKV,       "blk.%d.attn_qkv" },
+        { LLM_TENSOR_ATTN_OUT,       "blk.%d.attn_output" },
 
         // Mamba-specific tensors (SSM)
         { LLM_TENSOR_SSM_IN,         "blk.%d.ssm_in" },
         { LLM_TENSOR_SSM_CONV1D,     "blk.%d.ssm_conv1d" },
-        { LLM_TENSOR_SSM_X,          "blk.%d.ssm_x" },
+        { LLM_TENSOR_SSM_BCDT,       "blk.%d.ssm_bcdt" },
         { LLM_TENSOR_SSM_DT,         "blk.%d.ssm_dt" },
         { LLM_TENSOR_SSM_A,          "blk.%d.ssm_a" },
         { LLM_TENSOR_SSM_D,          "blk.%d.ssm_d" },

src/llama-arch.h

Lines changed: 1 addition & 0 deletions
@@ -268,6 +268,7 @@ enum llm_tensor {
     LLM_TENSOR_SSM_IN,
     LLM_TENSOR_SSM_CONV1D,
     LLM_TENSOR_SSM_X,
+    LLM_TENSOR_SSM_BCDT,
     LLM_TENSOR_SSM_DT,
     LLM_TENSOR_SSM_A,
     LLM_TENSOR_SSM_D,

src/llama-hparams.h

Lines changed: 3 additions & 0 deletions
@@ -7,6 +7,7 @@
 // bump if necessary
 #define LLAMA_MAX_LAYERS  512
 #define LLAMA_MAX_EXPERTS 256 // DeepSeekV3
+#define MAX_LAYER_BLOCK_TYPE_NAME_LEN 64
 
 enum llama_expert_gating_func_type {
     LLAMA_EXPERT_GATING_FUNC_TYPE_NONE = 0,
@@ -104,10 +105,12 @@ struct llama_hparams {
     llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
     // the size of the sliding window (0 - no SWA)
     uint32_t n_swa = 0;
+    uint32_t n_swa_pattern = 0; // sliding window attention pattern
     // if swa_layers[il] == true, then layer il is SWA
     // if swa_layers[il] == false, then layer il is dense (i.e. non-SWA)
     // by default, all layers are dense
     std::array<bool, LLAMA_MAX_LAYERS> swa_layers;
+    std::array<std::array<char, MAX_LAYER_BLOCK_TYPE_NAME_LEN>, LLAMA_MAX_LAYERS> layers_block_type_arr;
 
     // for State Space Models
     uint32_t ssm_d_conv = 0;

src/llama-model-loader.h

Lines changed: 1 addition & 0 deletions
@@ -5,6 +5,7 @@
 #include "llama-impl.h"
 #include "llama-arch.h"
 #include "llama-mmap.h"
+#include "llama-hparams.h"
 
 #include "ggml-cpp.h"
 
