Skip to content

Commit ced765b

Browse files
f291400 — authored by ngxson and CISC
model: support youtu-vl model (#18479)
* Support Youtu-VL Model * merge code * fix bug * revert qwen2 code & support rsplit in minja.hpp * update warm info * fix annotation * u * revert minja.hpp * fix * Do not write routed_scaling_factor to gguf when routed_scaling_factor is None * fix expert_weights_scale * LGTM after whitespace fixes * fix * fix * fix * layers to layer_index * enum fix --------- Co-authored-by: Xuan-Son Nguyen <son@huggingface.co> Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
1 parent 3ccccc8 commit ced765b

17 files changed

+473
-36
lines changed

convert_hf_to_gguf.py

Lines changed: 92 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1233,6 +1233,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
12331233
if chkhsh == "4a2e2abae11ca2b86d570fc5b44be4d5eb5e72cc8f22dd136a94b37da83ab665":
12341234
# ref: https://huggingface.co/KORMo-Team/KORMo-tokenizer
12351235
res = "kormo"
1236+
if chkhsh == "9d70134b369a70e5735009b6de918f7581b5211f7c074d1f89f753aea8248af1":
1237+
# ref: https://huggingface.co/tencent/Youtu-LLM-2B
1238+
res = "youtu"
12361239
if chkhsh == "16389f0a1f51ee53e562ffd51c371dc508639ab0e4261502071836e50e223e91":
12371240
# ref: https://huggingface.co/upstage/Solar-Open-100B
12381241
res = "solar-open"
@@ -7189,6 +7192,7 @@ def prepare_tensors(self):
71897192
"DeepseekV2ForCausalLM",
71907193
"DeepseekV3ForCausalLM",
71917194
"KimiVLForConditionalGeneration",
7195+
"YoutuForCausalLM",
71927196
)
71937197
class DeepseekV2Model(TextModel):
71947198
model_arch = gguf.MODEL_ARCH.DEEPSEEK2
@@ -7255,7 +7259,15 @@ def set_gguf_parameters(self):
72557259
super().set_gguf_parameters()
72567260
hparams = self.hparams
72577261

7258-
self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
7262+
# first_k_dense_replace: number of leading layers using dense FFN instead of MoE
7263+
# For non-MoE models (like Youtu), set to n_layer to use dense FFN for all layers
7264+
# For MoE models (like DeepSeek-V2), this is the number of leading non-MoE layers
7265+
has_moe = hparams.get("n_routed_experts") is not None
7266+
first_k_dense_replace = hparams.get("first_k_dense_replace")
7267+
if first_k_dense_replace is None:
7268+
# Default: if no MoE, all layers are dense; if MoE, none are dense
7269+
first_k_dense_replace = hparams["num_hidden_layers"] if not has_moe else 0
7270+
self.gguf_writer.add_leading_dense_block_count(first_k_dense_replace)
72597271
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
72607272
if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None:
72617273
self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
@@ -7267,11 +7279,24 @@ def set_gguf_parameters(self):
72677279
self.gguf_writer.add_key_length_mla(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
72687280
self.gguf_writer.add_value_length_mla(hparams["v_head_dim"])
72697281

7270-
self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
7271-
self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
7272-
self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
7273-
self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"])
7274-
self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"])
7282+
# MoE parameters (required by C++ code for DEEPSEEK2 arch)
7283+
# For non-MoE models like Youtu, use intermediate_size as expert_feed_forward_length
7284+
moe_intermediate_size = self.find_hparam(["moe_intermediate_size", "intermediate_size"], optional=False)
7285+
self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
7286+
7287+
if (n_routed_experts := hparams.get("n_routed_experts")) is not None:
7288+
self.gguf_writer.add_expert_count(n_routed_experts)
7289+
7290+
# expert_shared_count is required by C++ code, default to 0 for non-MoE models
7291+
n_shared_experts = hparams.get("n_shared_experts", 0)
7292+
self.gguf_writer.add_expert_shared_count(n_shared_experts)
7293+
7294+
# When not set, C++ code will use scale_w = false to skip the no-op scaling
7295+
if (routed_scaling_factor := hparams.get("routed_scaling_factor")) is not None:
7296+
self.gguf_writer.add_expert_weights_scale(routed_scaling_factor)
7297+
7298+
if (norm_topk_prob := hparams.get("norm_topk_prob")) is not None and norm_topk_prob:
7299+
self.gguf_writer.add_expert_weights_norm(norm_topk_prob)
72757300

72767301
self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
72777302

@@ -7287,10 +7312,17 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
72877312
# skip vision tensors and remove "language_model." for Kimi-VL
72887313
if "vision_tower" in name or "multi_modal_projector" in name:
72897314
return []
7290-
7315+
if name.startswith("siglip2.") or name.startswith("merger."):
7316+
return []
72917317
if name.startswith("language_model."):
72927318
name = name.replace("language_model.", "")
72937319

7320+
# skip lm_head.weight if tie_word_embeddings is True
7321+
if self.hparams.get("tie_word_embeddings", False):
7322+
if name == "lm_head.weight" or name == "model.lm_head.weight":
7323+
logger.info("Skipping tied output layer 'lm_head.weight' (will use token_embd.weight)")
7324+
return []
7325+
72947326
# rename e_score_correction_bias tensors
72957327
if name.endswith("e_score_correction_bias"):
72967328
name = name.replace("e_score_correction_bias", "e_score_correction.bias")
@@ -10625,6 +10657,59 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
1062510657
return []
1062610658

1062710659

10660+
@ModelBase.register("YOUTUVLForConditionalGeneration", "YOUTUVLForCausalLM")
10661+
class YOUTUVLVisionModel(MmprojModel):
10662+
def __init__(self, *args, **kwargs):
10663+
super().__init__(*args, **kwargs)
10664+
assert self.hparams_vision is not None
10665+
self.hparams_vision["image_size"] = self.hparams_vision.get("image_size", 560)
10666+
10667+
def set_gguf_parameters(self):
10668+
super().set_gguf_parameters()
10669+
10670+
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.YOUTUVL)
10671+
self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6))
10672+
10673+
# Handle activation function
10674+
hidden_act = str(self.hparams.get("hidden_act", "gelu_pytorch_tanh")).lower()
10675+
if hidden_act in ("gelu", "gelu_pytorch_tanh", "gelu_fast", "gelu_new", "gelu_accurate"):
10676+
self.gguf_writer.add_vision_use_gelu(True)
10677+
elif hidden_act == "silu":
10678+
self.gguf_writer.add_vision_use_silu(True)
10679+
else:
10680+
raise ValueError(f"Unsupported activation function for YOUTUVL: {hidden_act}")
10681+
10682+
self.gguf_writer.add_vision_spatial_merge_size(self.hparams.get("spatial_merge_size", 2))
10683+
10684+
window_size = self.hparams.get("window_size")
10685+
if window_size is not None:
10686+
self.gguf_writer.add_vision_window_size(window_size)
10687+
# fullatt_block_indexes contains explicit layer indices that use full attention
10688+
# e.g., [2, 5, 8, 11] means layers 2, 5, 8, 11 use full attention
10689+
# All other layers use window attention
10690+
fullatt_block_indexes = self.hparams.get("fullatt_block_indexes")
10691+
assert fullatt_block_indexes is not None, "fullatt_block_indexes is required for youtuvl"
10692+
# Store the explicit layer indices for YoutuVL (irregular pattern approach)
10693+
self.gguf_writer.add_vision_wa_layer_indexes(layers=fullatt_block_indexes)
10694+
10695+
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
10696+
del bid # unused
10697+
10698+
# Skip language model tensors
10699+
skip_prefixes = ('lm_head.', 'model.layers.', 'model.embed_tokens.', 'model.norm.')
10700+
if name.startswith(skip_prefixes):
10701+
return []
10702+
10703+
# Try to map the tensor using TensorNameMap (handles vision encoder and projector)
10704+
try:
10705+
new_name = self.map_tensor_name(name)
10706+
return [(new_name, data_torch)]
10707+
except ValueError:
10708+
# If mapping fails, log warning and skip
10709+
logger.warning(f"Cannot map tensor: {name}")
10710+
return []
10711+
10712+
1062810713
@ModelBase.register("SolarOpenForCausalLM")
1062910714
class SolarOpenModel(Glm4MoeModel):
1063010715
model_arch = gguf.MODEL_ARCH.GLM4_MOE

convert_hf_to_gguf_update.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,7 @@ class TOKENIZER_TYPE(IntEnum):
145145
{"name": "granite-docling", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ibm-granite/granite-docling-258M", },
146146
{"name": "minimax-m2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/MiniMaxAI/MiniMax-M2", },
147147
{"name": "kormo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/KORMo-Team/KORMo-tokenizer", },
148+
{"name": "youtu", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tencent/Youtu-LLM-2B", },
148149
{"name": "solar-open", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/upstage/Solar-Open-100B", },
149150
]
150151

gguf-py/gguf/constants.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -294,7 +294,9 @@ class ClipVision:
294294
USE_GELU = "clip.use_gelu"
295295
USE_SILU = "clip.use_silu"
296296
N_WA_PATTERN = "clip.vision.n_wa_pattern" # used by qwen2.5vl
297+
WA_LAYER_INDEXES = "clip.vision.wa_layer_indexes" # used by youtuvl
297298
IS_DEEPSTACK_LAYERS = "clip.vision.is_deepstack_layers"
299+
WINDOW_SIZE = "clip.vision.window_size"
298300

299301
class Attention:
300302
HEAD_COUNT = "clip.vision.attention.head_count"
@@ -3494,6 +3496,7 @@ class VisionProjectorType:
34943496
LFM2A = "lfm2a" # audio
34953497
MUSIC_FLAMINGO = "musicflamingo" # audio
34963498
GLM4V = "glm4v"
3499+
YOUTUVL = "youtuvl"
34973500

34983501

34993502
# Items here are (block size, type size)

gguf-py/gguf/gguf_writer.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1129,11 +1129,40 @@ def add_vision_projector_scale_factor(self, value: int) -> None:
11291129
self.add_uint32(Keys.ClipVision.Projector.SCALE_FACTOR, value)
11301130

11311131
def add_vision_n_wa_pattern(self, value: int) -> None:
1132+
"""Add window attention pattern interval for vision models.
1133+
1134+
This defines the pattern interval for window attention vs full attention layers.
1135+
For example, if n_wa_pattern=4, then layers 3, 7, 11, ... use full attention,
1136+
while other layers use window attention.
1137+
1138+
Used by models like Qwen2.5-VL where full attention layers follow a regular pattern.
1139+
"""
11321140
self.add_uint32(Keys.ClipVision.N_WA_PATTERN, value)
11331141

1142+
def add_vision_wa_layer_indexes(self, layers: Sequence[int]) -> None:
1143+
"""Add explicit layer indexes that use full attention in vision models.
1144+
1145+
This specifies the exact layer indices (0-based) that should use full attention
1146+
instead of window attention. All other layers will use window attention.
1147+
1148+
Args:
1149+
layers: List of layer indices that use full attention (e.g., [3, 7, 11, 15])
1150+
1151+
Used by models like YoutuVL where full attention layers are explicitly specified
1152+
rather than following a regular pattern.
1153+
1154+
Difference from add_vision_n_wa_pattern:
1155+
- n_wa_pattern: Defines a regular interval pattern (every Nth layer uses full attention)
1156+
- wa_layer_indexes: Explicitly lists which layers use full attention (irregular pattern)
1157+
"""
1158+
self.add_array(Keys.ClipVision.WA_LAYER_INDEXES, layers)
1159+
11341160
def add_vision_is_deepstack_layers(self, layers: Sequence[bool]) -> None:
11351161
self.add_array(Keys.ClipVision.IS_DEEPSTACK_LAYERS, layers)
11361162

1163+
def add_vision_window_size(self, value: int) -> None:
1164+
self.add_uint32(Keys.ClipVision.WINDOW_SIZE, value)
1165+
11371166
# audio models
11381167

11391168
def add_audio_projection_dim(self, value: int) -> None:

gguf-py/gguf/tensor_mapping.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1221,6 +1221,7 @@ class TensorNameMap:
12211221
MODEL_TENSOR.V_MMPROJ: (
12221222
"multi_modal_projector.linear_{bid}",
12231223
"visual.merger.mlp.{bid}", # qwen2vl
1224+
"merger.mlp.{bid}",
12241225
),
12251226

12261227
MODEL_TENSOR.V_MMPROJ_FC: (
@@ -1258,6 +1259,7 @@ class TensorNameMap:
12581259
"visual.patch_embed.proj", # qwen2vl
12591260
"vision_tower.patch_embed.proj", # kimi-vl
12601261
"model.vision.patch_embedding.proj", # cogvlm
1262+
"siglip2.vision_model.embeddings.patch_embedding",
12611263
),
12621264

12631265
MODEL_TENSOR.V_ENC_EMBD_NORM: (
@@ -1291,6 +1293,7 @@ class TensorNameMap:
12911293
"vision_encoder.transformer.layers.{bid}.attention.wq", # pixtral
12921294
"visual.blocks.{bid}.attn.q", # qwen2vl, generated
12931295
"vision_tower.encoder.blocks.{bid}.wq", # kimi-vl, generated
1296+
"siglip2.vision_model.encoder.layers.{bid}.self_attn.q_proj", # youtuvl
12941297
),
12951298

12961299
MODEL_TENSOR.V_ENC_ATTN_Q_NORM: (
@@ -1308,6 +1311,7 @@ class TensorNameMap:
13081311
"vision_encoder.transformer.layers.{bid}.attention.wk", # pixtral
13091312
"visual.blocks.{bid}.attn.k", # qwen2vl, generated
13101313
"vision_tower.encoder.blocks.{bid}.wk", # kimi-vl, generated
1314+
"siglip2.vision_model.encoder.layers.{bid}.self_attn.k_proj",
13111315
),
13121316

13131317
MODEL_TENSOR.V_ENC_ATTN_K_NORM: (
@@ -1325,6 +1329,7 @@ class TensorNameMap:
13251329
"vision_encoder.transformer.layers.{bid}.attention.wv", # pixtral
13261330
"visual.blocks.{bid}.attn.v", # qwen2vl, generated
13271331
"vision_tower.encoder.blocks.{bid}.wv", # kimi-vl, generated
1332+
"siglip2.vision_model.encoder.layers.{bid}.self_attn.v_proj",
13281333
),
13291334

13301335
MODEL_TENSOR.V_ENC_INPUT_NORM: (
@@ -1339,6 +1344,7 @@ class TensorNameMap:
13391344
"visual.blocks.{bid}.norm1", # qwen2vl
13401345
"vision_tower.encoder.blocks.{bid}.norm0", # kimi-vl (norm0/norm1)
13411346
"model.vision.transformer.layers.{bid}.input_layernorm", # cogvlm
1347+
"siglip2.vision_model.encoder.layers.{bid}.layer_norm1",
13421348
),
13431349

13441350
MODEL_TENSOR.V_ENC_ATTN_O: (
@@ -1354,6 +1360,7 @@ class TensorNameMap:
13541360
"visual.blocks.{bid}.attn.proj", # qwen2vl
13551361
"vision_tower.encoder.blocks.{bid}.wo", # kimi-vl
13561362
"model.vision.transformer.layers.{bid}.attention.dense", # cogvlm
1363+
"siglip2.vision_model.encoder.layers.{bid}.self_attn.out_proj", # youtuvl
13571364
),
13581365

13591366
MODEL_TENSOR.V_ENC_POST_ATTN_NORM: (
@@ -1368,6 +1375,7 @@ class TensorNameMap:
13681375
"visual.blocks.{bid}.norm2", # qwen2vl
13691376
"vision_tower.encoder.blocks.{bid}.norm1", # kimi-vl (norm0/norm1)
13701377
"model.vision.transformer.layers.{bid}.post_attention_layernorm", # cogvlm
1378+
"siglip2.vision_model.encoder.layers.{bid}.layer_norm2",
13711379
),
13721380

13731381
MODEL_TENSOR.V_ENC_FFN_UP: (
@@ -1383,6 +1391,7 @@ class TensorNameMap:
13831391
"visual.blocks.{bid}.mlp.linear_fc1", # qwen3vl
13841392
"vision_tower.encoder.blocks.{bid}.mlp.fc0", # kimi-vl (fc0/fc1)
13851393
"model.vision.transformer.layers.{bid}.mlp.fc1", # cogvlm
1394+
"siglip2.vision_model.encoder.layers.{bid}.mlp.fc1",
13861395
),
13871396

13881397
MODEL_TENSOR.V_ENC_FFN_GATE: (
@@ -1404,6 +1413,7 @@ class TensorNameMap:
14041413
"visual.blocks.{bid}.mlp.linear_fc2", # qwen3vl
14051414
"vision_tower.encoder.blocks.{bid}.mlp.fc1", # kimi-vl (fc0/fc1)
14061415
"model.vision.transformer.layers.{bid}.mlp.fc2", # cogvlm
1416+
"siglip2.vision_model.encoder.layers.{bid}.mlp.fc2",
14071417
),
14081418

14091419
MODEL_TENSOR.V_LAYER_SCALE_1: (
@@ -1430,6 +1440,7 @@ class TensorNameMap:
14301440
"visual.merger.ln_q", # qwen2vl
14311441
"vision_tower.encoder.final_layernorm", # kimi-vl
14321442
"visual.post_layernorm", # glm4v
1443+
"siglip2.vision_model.post_layernorm",
14331444
),
14341445

14351446
MODEL_TENSOR.V_MM_POST_NORM: (
@@ -1446,6 +1457,7 @@ class TensorNameMap:
14461457
"multi_modal_projector.pre_norm",
14471458
"pre_mm_projector_norm",
14481459
"model.vision.linear_proj.norm1", # cogvlm
1460+
"merger.ln_q",
14491461
),
14501462

14511463
MODEL_TENSOR.V_MM_SOFT_EMB_NORM: (

src/llama-model.cpp

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1683,7 +1683,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
16831683
ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla, false);
16841684
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
16851685
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
1686-
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
1686+
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
16871687
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
16881688
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
16891689
if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
@@ -4785,7 +4785,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
47854785

47864786
// output
47874787
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4788-
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
4788+
// try to load output.weight, if not found, use token_embd (tied embeddings)
4789+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4790+
if (!output) {
4791+
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4792+
}
47894793

47904794
for (int i = 0; i < n_layer; ++i) {
47914795
auto & layer = layers[i];
@@ -4848,7 +4852,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
48484852

48494853
// output
48504854
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
4851-
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
4855+
// try to load output.weight, if not found, use token_embd (tied embeddings)
4856+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
4857+
if (!output) {
4858+
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
4859+
}
48524860

48534861
for (int i = 0; i < n_layer; ++i) {
48544862
auto & layer = layers[i];

src/llama-vocab.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -314,6 +314,12 @@ struct llm_tokenizer_bpe : llm_tokenizer {
314314
"[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\r\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\r\n]*|\\s*[\r\n]+|\\s+(?!\\S)|\\s+",
315315
};
316316
break;
317+
case LLAMA_VOCAB_PRE_TYPE_YOUTU:
318+
regex_exprs = {
319+
"[가-힣ㄱ-ㆎ]+|[!…“”‘’—:;,、-〿︰-﹏]+|[ㄅ-ㄯ]+|[一-龥぀-ゟ゠-ヿ]+",
320+
"[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
321+
};
322+
break;
317323
case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
318324
regex_exprs = {
319325
"[\r\n]",
@@ -1861,6 +1867,11 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
18611867
tokenizer_pre == "deepseek-v3") {
18621868
pre_type = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM;
18631869
clean_spaces = false;
1870+
} else if (
1871+
tokenizer_pre == "youtu") {
1872+
pre_type = LLAMA_VOCAB_PRE_TYPE_YOUTU;
1873+
clean_spaces = false;
1874+
ignore_merges = true;
18641875
} else if (
18651876
tokenizer_pre == "falcon") {
18661877
pre_type = LLAMA_VOCAB_PRE_TYPE_FALCON;

src/llama-vocab.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ enum llama_vocab_pre_type {
5252
LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2 = 41,
5353
LLAMA_VOCAB_PRE_TYPE_AFMOE = 42,
5454
LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN = 43,
55+
LLAMA_VOCAB_PRE_TYPE_YOUTU = 44,
5556
};
5657

5758
struct LLM_KV;

src/models/deepseek2.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -215,7 +215,7 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
215215
model.layers[il].ffn_exp_probs_b,
216216
n_expert, n_expert_used,
217217
LLM_FFN_SILU, hparams.expert_weights_norm,
218-
true, hparams.expert_weights_scale,
218+
hparams.expert_weights_scale, hparams.expert_weights_scale,
219219
(llama_expert_gating_func_type) hparams.expert_gating_func,
220220
il);
221221
cb(moe_out, "ffn_moe_out", il);

0 commit comments

Comments (0)