
Commit 694b963

Trying to fix tokenizer
1 parent 8bea4f8 commit 694b963

7 files changed: +301 additions, -36 deletions

convert_hf_to_gguf.py

Lines changed: 75 additions & 23 deletions
@@ -1551,7 +1551,7 @@ def set_vocab(self):
         self.gguf_writer.add_token_list(tokens)
         self.gguf_writer.add_token_types(toktypes)
 
-        special_vocab = gguf.SpecialVocab(dir_model, n_vocab=len(tokens))
+        special_vocab = gguf.SpecialVocab(dir_model, n_vocab=len(tokens), load_merges=True)
         special_vocab.add_to_gguf(self.gguf_writer)
 
     def set_gguf_parameters(self):
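Note on the load_merges change above: gguf.SpecialVocab comes from llama.cpp's gguf-py package, and load_merges=True asks it to also read the BPE merge list from the checkpoint's tokenizer files so that add_to_gguf() writes the merges alongside the special token IDs. A minimal usage sketch, with placeholder paths and an assumed vocab size:

import gguf
from pathlib import Path

dir_model = Path("/path/to/hf-checkpoint")       # placeholder checkpoint directory
writer = gguf.GGUFWriter("model.gguf", "llama")  # placeholder output file and arch name

# load_merges=True loads merge rules (tokenizer.json / merges.txt) in addition to
# the special token IDs; add_to_gguf() writes both into the GGUF metadata
special_vocab = gguf.SpecialVocab(dir_model, n_vocab=32000, load_merges=True)
special_vocab.add_to_gguf(writer)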
@@ -2200,41 +2200,68 @@ def set_vocab(self):
 
         from transformers import AutoTokenizer
         tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
-        vocab_size = len(tokenizer.vocab)
-        # Since we are checking the maximum index, we need to ensure it's strictly less than vocab_size,
-        # because vocab_size is the count of items, and indexes start at 0.
+
+        # PLaMo2 has padded vocabulary - get actual size from embedding weight
+        # Load the embedding tensor to get the real vocab size
+        import torch
+        from safetensors import safe_open
+        actual_vocab_size = None
+
+        # Check the model weight files to get actual vocab size
+        weight_map_file = dir_model / "model.safetensors.index.json"
+        if weight_map_file.exists():
+            import json
+            with open(weight_map_file, 'r') as f:
+                weight_map = json.load(f)
+            embed_file = weight_map['weight_map']['model.embed_tokens.weight']
+            embed_path = dir_model / embed_file
+
+            with safe_open(str(embed_path), framework='pt', device='cpu') as f:
+                embed_weight = f.get_tensor('model.embed_tokens.weight')
+                actual_vocab_size = embed_weight.shape[0]
+
+        vocab_size = actual_vocab_size if actual_vocab_size else len(tokenizer.vocab)
+
+        # Since we are checking the maximum index, we need to ensure it's strictly less than tokenizer vocab size,
+        # because PLaMo2 has padded vocabulary
         max_vocab_index = max(tokenizer.get_vocab().values())
-        if max_vocab_index >= vocab_size:
+        if max_vocab_index >= len(tokenizer.vocab):
             raise ValueError("Vocabulary size exceeds expected maximum size.")
 
         reverse_vocab: dict[int, str] = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
         added_vocab = tokenizer.get_added_vocab()
 
         for token_id in range(vocab_size):
-            token_text = reverse_vocab[token_id].encode('utf-8')
-            # replace "\x00" to string with length > 0
-            if token_text == b"\x00":
-                toktype = gguf.TokenType.BYTE  # special
-                token_text = f"<{token_text}>".encode('utf-8')
-            elif re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text):
-                toktype = gguf.TokenType.BYTE  # special
-            elif reverse_vocab[token_id] in added_vocab:
-                if tokenizer.added_tokens_decoder[token_id].special:
-                    toktype = gguf.TokenType.CONTROL
-                else:
-                    toktype = gguf.TokenType.USER_DEFINED
+            # Handle padding tokens for vocab entries beyond tokenizer vocabulary
+            if token_id >= len(tokenizer.vocab):
+                # Create padding tokens for the extra vocabulary entries
+                token_text = f"<pad_{token_id}>".encode('utf-8')
+                toktype = gguf.TokenType.UNUSED
             else:
-                toktype = gguf.TokenType.NORMAL
+                token_text = reverse_vocab[token_id].encode('utf-8')
+                # replace "\x00" to string with length > 0
+                if token_text == b"\x00":
+                    toktype = gguf.TokenType.BYTE  # special
+                    token_text = f"<{token_text}>".encode('utf-8')
+                elif re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text):
+                    toktype = gguf.TokenType.BYTE  # special
+                elif reverse_vocab[token_id] in added_vocab:
+                    if tokenizer.added_tokens_decoder[token_id].special:
+                        toktype = gguf.TokenType.CONTROL
+                    else:
+                        toktype = gguf.TokenType.USER_DEFINED
+                else:
+                    toktype = gguf.TokenType.NORMAL
 
             tokens.append(token_text)
             toktypes.append(toktype)
 
-        # self.gguf_writer.add_tokenizer_model("llama")
-        # self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_tokenizer_model("llama")
+        self.gguf_writer.add_tokenizer_pre("default")
         self.gguf_writer.add_token_list(tokens)
         self.gguf_writer.add_token_types(toktypes)
 
-        special_vocab = gguf.SpecialVocab(dir_model, n_vocab=len(tokens))
+        special_vocab = gguf.SpecialVocab(dir_model, n_vocab=len(tokens), load_merges=False)
         special_vocab.add_to_gguf(self.gguf_writer)
 
     def set_gguf_parameters(self):
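The key trick in the new set_vocab above can be isolated: the padded vocabulary size is the row count of model.embed_tokens.weight, read from the safetensors shard listed in the index file, with the tokenizer size as a fallback. A standalone sketch of that check (the index file and tensor names are taken from the diff; the helper itself is illustrative):

import json
from pathlib import Path
from safetensors import safe_open

def padded_vocab_size(dir_model: Path, fallback: int) -> int:
    # Sharded checkpoints map each tensor to its shard in model.safetensors.index.json
    index_file = dir_model / "model.safetensors.index.json"
    if not index_file.exists():
        return fallback
    weight_map = json.loads(index_file.read_text())["weight_map"]
    shard = dir_model / weight_map["model.embed_tokens.weight"]
    with safe_open(str(shard), framework="pt", device="cpu") as f:
        # Row count of the embedding matrix equals the padded vocabulary size
        return f.get_tensor("model.embed_tokens.weight").shape[0]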
@@ -2245,9 +2272,16 @@ def set_gguf_parameters(self):
         # Mamba parameters
         if hparams.get("mamba_enabled", False):
             self.gguf_writer.add_ssm_conv_kernel(hparams.get("mamba_d_conv", 4))
-            self.gguf_writer.add_ssm_inner_size(hparams.get("mamba_d_state", 64) * hparams.get("intermediate_size", 13312) // hparams.get("hidden_size", 4096))
+            # PLaMo2 SSM inner size = mamba_num_heads * hidden_size_per_head
+            mamba_num_heads = hparams.get("mamba_num_heads", 64)
+            hidden_size_per_head = hparams.get("hidden_size_per_head", 128)
+            ssm_inner_size = mamba_num_heads * hidden_size_per_head
+            self.gguf_writer.add_ssm_inner_size(ssm_inner_size)
             self.gguf_writer.add_ssm_state_size(hparams.get("mamba_d_state", 64))
-            self.gguf_writer.add_ssm_time_step_rank(hparams.get("mamba_d_state", 64) // 16)  # Commonly d_state/16
+            # PLaMo2 dt_dim = max(64, hidden_size // 16)
+            hidden_size = hparams.get("hidden_size", 4096)
+            dt_dim = max(64, hidden_size // 16)
+            self.gguf_writer.add_ssm_time_step_rank(dt_dim)
 
         # Attention window parameters
         if "attention_window_size" in hparams:
@@ -2273,6 +2307,24 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
             # Reconstruct the name without the duplicate "layers"
             name = f"model.layers.{layer_num}.{rest}"
 
+        # Handle combined gate_up_proj tensor split
+        if name.endswith(".mlp.gate_up_proj.weight"):
+            # Split the combined gate_up tensor into separate gate and up tensors
+            # The tensor shape is (2 * intermediate_size, hidden_size)
+            # Split along dim 0 to get gate (first half) and up (second half)
+            intermediate_size = data_torch.shape[0] // 2
+            gate_weight = data_torch[:intermediate_size, :]
+            up_weight = data_torch[intermediate_size:, :]
+
+            # Map to the correct names
+            gate_name = self.map_tensor_name(name.replace("gate_up_proj", "gate_proj"))
+            up_name = self.map_tensor_name(name.replace("gate_up_proj", "up_proj"))
+
+            return [
+                (gate_name, gate_weight),
+                (up_name, up_weight)
+            ]
+
         # Handle Mamba-specific A_log tensor transformation
         if name.endswith(".A_log"):
             # Map the tensor name first
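The split above assumes the fused weight stacks gate_proj on top of up_proj along dim 0, i.e. a (2 * intermediate_size, hidden_size) matrix whose first half is the gate. A toy check of that slicing with made-up dimensions:

import torch

d_model, d_ff = 8, 16
gate_up = torch.randn(2 * d_ff, d_model)  # fused (2 * intermediate_size, hidden_size)

half = gate_up.shape[0] // 2
gate_w = gate_up[:half, :]  # first half  -> gate_proj
up_w = gate_up[half:, :]    # second half -> up_proj

assert gate_w.shape == (d_ff, d_model) and up_w.shape == (d_ff, d_model)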

src/llama-arch.cpp

Lines changed: 3 additions & 5 deletions
@@ -756,15 +756,13 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
         // Attention-specific tensors
         { LLM_TENSOR_ROPE_FREQS,     "rope_freqs" },
         { LLM_TENSOR_ATTN_ROT_EMBD,  "blk.%d.attn_rot_embd" },
-        { LLM_TENSOR_ATTN_Q,         "blk.%d.attn_q" },
-        { LLM_TENSOR_ATTN_K,         "blk.%d.attn_k" },
-        { LLM_TENSOR_ATTN_V,         "blk.%d.attn_v" },
-        { LLM_TENSOR_ATTN_OUT,       "blk.%d.attn_output" },
+        { LLM_TENSOR_ATTN_QKV,       "blk.%d.attn_qkv" },
+        { LLM_TENSOR_ATTN_OUT,       "blk.%d.attn_output" },
 
         // Mamba-specific tensors (SSM)
         { LLM_TENSOR_SSM_IN,         "blk.%d.ssm_in" },
         { LLM_TENSOR_SSM_CONV1D,     "blk.%d.ssm_conv1d" },
-        { LLM_TENSOR_SSM_X,          "blk.%d.ssm_x" },
+        { LLM_TENSOR_SSM_BCDT,       "blk.%d.ssm_bcdt" },
         { LLM_TENSOR_SSM_DT,         "blk.%d.ssm_dt" },
         { LLM_TENSOR_SSM_A,          "blk.%d.ssm_a" },
         { LLM_TENSOR_SSM_D,          "blk.%d.ssm_d" },

src/llama-arch.h

Lines changed: 1 addition & 0 deletions
@@ -268,6 +268,7 @@ enum llm_tensor {
     LLM_TENSOR_SSM_IN,
     LLM_TENSOR_SSM_CONV1D,
     LLM_TENSOR_SSM_X,
+    LLM_TENSOR_SSM_BCDT,
     LLM_TENSOR_SSM_DT,
     LLM_TENSOR_SSM_A,
     LLM_TENSOR_SSM_D,

src/llama-hparams.h

Lines changed: 3 additions & 0 deletions
@@ -7,6 +7,7 @@
 // bump if necessary
 #define LLAMA_MAX_LAYERS  512
 #define LLAMA_MAX_EXPERTS 256 // DeepSeekV3
+#define MAX_LAYER_BLOCK_TYPE_NAME_LEN 64
 
 enum llama_expert_gating_func_type {
     LLAMA_EXPERT_GATING_FUNC_TYPE_NONE = 0,
@@ -104,10 +105,12 @@ struct llama_hparams {
     llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
     // the size of the sliding window (0 - no SWA)
     uint32_t n_swa = 0;
+    uint32_t n_swa_pattern = 0; // sliding window attention pattern
     // if swa_layers[il] == true, then layer il is SWA
     // if swa_layers[il] == false, then layer il is dense (i.e. non-SWA)
     // by default, all layers are dense
     std::array<bool, LLAMA_MAX_LAYERS> swa_layers;
+    std::array<std::array<char, MAX_LAYER_BLOCK_TYPE_NAME_LEN>, LLAMA_MAX_LAYERS> layers_block_type_arr;
 
     // for State Space Models
     uint32_t ssm_d_conv = 0;

src/llama-model-loader.h

Lines changed: 1 addition & 0 deletions
@@ -5,6 +5,7 @@
 #include "llama-impl.h"
 #include "llama-arch.h"
 #include "llama-mmap.h"
+#include "llama-hparams.h"
 
 #include "ggml-cpp.h"
 
