@@ -1233,6 +1233,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
12331233 if chkhsh == "4a2e2abae11ca2b86d570fc5b44be4d5eb5e72cc8f22dd136a94b37da83ab665":
12341234 # ref: https://huggingface.co/KORMo-Team/KORMo-tokenizer
12351235 res = "kormo"
1236+ if chkhsh == "9d70134b369a70e5735009b6de918f7581b5211f7c074d1f89f753aea8248af1":
1237+ # ref: https://huggingface.co/tencent/Youtu-LLM-2B
1238+ res = "youtu"
12361239 if chkhsh == "16389f0a1f51ee53e562ffd51c371dc508639ab0e4261502071836e50e223e91":
12371240 # ref: https://huggingface.co/upstage/Solar-Open-100B
12381241 res = "solar-open"
@@ -7189,6 +7192,7 @@ def prepare_tensors(self):
71897192 "DeepseekV2ForCausalLM",
71907193 "DeepseekV3ForCausalLM",
71917194 "KimiVLForConditionalGeneration",
7195+ "YoutuForCausalLM",
71927196)
71937197class DeepseekV2Model(TextModel):
71947198 model_arch = gguf.MODEL_ARCH.DEEPSEEK2
@@ -7255,7 +7259,15 @@ def set_gguf_parameters(self):
72557259 super().set_gguf_parameters()
72567260 hparams = self.hparams
72577261
7258- self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
7262+ # first_k_dense_replace: number of leading layers using dense FFN instead of MoE
7263+ # For non-MoE models (like Youtu), set to n_layer to use dense FFN for all layers
7264+ # For MoE models (like DeepSeek-V2), this is the number of leading non-MoE layers
7265+ has_moe = hparams.get("n_routed_experts") is not None
7266+ first_k_dense_replace = hparams.get("first_k_dense_replace")
7267+ if first_k_dense_replace is None:
7268+ # Default: if no MoE, all layers are dense; if MoE, none are dense
7269+ first_k_dense_replace = hparams["num_hidden_layers"] if not has_moe else 0
7270+ self.gguf_writer.add_leading_dense_block_count(first_k_dense_replace)
72597271 self.gguf_writer.add_vocab_size(hparams["vocab_size"])
72607272 if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None:
72617273 self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
@@ -7267,11 +7279,24 @@ def set_gguf_parameters(self):
72677279 self.gguf_writer.add_key_length_mla(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
72687280 self.gguf_writer.add_value_length_mla(hparams["v_head_dim"])
72697281
7270- self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
7271- self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
7272- self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
7273- self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"])
7274- self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"])
7282+ # MoE parameters (required by C++ code for DEEPSEEK2 arch)
7283+ # For non-MoE models like Youtu, use intermediate_size as expert_feed_forward_length
7284+ moe_intermediate_size = self.find_hparam(["moe_intermediate_size", "intermediate_size"], optional=False)
7285+ self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
7286+
7287+ if (n_routed_experts := hparams.get("n_routed_experts")) is not None:
7288+ self.gguf_writer.add_expert_count(n_routed_experts)
7289+
7290+ # expert_shared_count is required by C++ code, default to 0 for non-MoE models
7291+ n_shared_experts = hparams.get("n_shared_experts", 0)
7292+ self.gguf_writer.add_expert_shared_count(n_shared_experts)
7293+
7294+ # When not set, C++ code will use scale_w = false to skip the no-op scaling
7295+ if (routed_scaling_factor := hparams.get("routed_scaling_factor")) is not None:
7296+ self.gguf_writer.add_expert_weights_scale(routed_scaling_factor)
7297+
7298+ if (norm_topk_prob := hparams.get("norm_topk_prob")) is not None and norm_topk_prob:
7299+ self.gguf_writer.add_expert_weights_norm(norm_topk_prob)
72757300
72767301 self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
72777302
@@ -7287,10 +7312,17 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
72877312 # skip vision tensors and remove "language_model." for Kimi-VL
72887313 if "vision_tower" in name or "multi_modal_projector" in name:
72897314 return []
7290-
7315+ if name.startswith("siglip2.") or name.startswith("merger."):
7316+ return []
72917317 if name.startswith("language_model."):
72927318 name = name.replace("language_model.", "")
72937319
7320+ # skip lm_head.weight if tie_word_embeddings is True
7321+ if self.hparams.get("tie_word_embeddings", False):
7322+ if name == "lm_head.weight" or name == "model.lm_head.weight":
7323+ logger.info("Skipping tied output layer 'lm_head.weight' (will use token_embd.weight)")
7324+ return []
7325+
72947326 # rename e_score_correction_bias tensors
72957327 if name.endswith("e_score_correction_bias"):
72967328 name = name.replace("e_score_correction_bias", "e_score_correction.bias")
@@ -10625,6 +10657,59 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
1062510657 return []
1062610658
1062710659
@ModelBase.register("YOUTUVLForConditionalGeneration", "YOUTUVLForCausalLM")
class YOUTUVLVisionModel(MmprojModel):
    """mmproj converter for the Youtu-VL vision encoder + projector."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        assert self.hparams_vision is not None
        # default to a 560px input when the checkpoint omits image_size
        self.hparams_vision.setdefault("image_size", 560)

    def set_gguf_parameters(self):
        super().set_gguf_parameters()

        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.YOUTUVL)
        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6))

        # map the configured activation onto the writer's gelu/silu flags
        act = str(self.hparams.get("hidden_act", "gelu_pytorch_tanh")).lower()
        if act == "silu":
            self.gguf_writer.add_vision_use_silu(True)
        elif act in ("gelu", "gelu_pytorch_tanh", "gelu_fast", "gelu_new", "gelu_accurate"):
            self.gguf_writer.add_vision_use_gelu(True)
        else:
            raise ValueError(f"Unsupported activation function for YOUTUVL: {act}")

        self.gguf_writer.add_vision_spatial_merge_size(self.hparams.get("spatial_merge_size", 2))

        win_size = self.hparams.get("window_size")
        if win_size is not None:
            self.gguf_writer.add_vision_window_size(win_size)
            # fullatt_block_indexes lists the layer indices that use full
            # attention (e.g. [2, 5, 8, 11]); every other layer uses window
            # attention. The irregular pattern is stored as explicit indices.
            full_attn_layers = self.hparams.get("fullatt_block_indexes")
            assert full_attn_layers is not None, "fullatt_block_indexes is required for youtuvl"
            self.gguf_writer.add_vision_wa_layer_indexes(layers=full_attn_layers)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        del bid  # unused

        # tensors belonging to the language model are converted separately
        if name.startswith(('lm_head.', 'model.layers.', 'model.embed_tokens.', 'model.norm.')):
            return []

        # map vision-encoder / projector tensors via TensorNameMap; anything
        # unknown is logged and dropped rather than aborting the conversion
        try:
            mapped = self.map_tensor_name(name)
        except ValueError:
            logger.warning(f"Cannot map tensor: {name}")
            return []
        return [(mapped, data_torch)]
10712+
1062810713@ModelBase.register("SolarOpenForCausalLM")
1062910714class SolarOpenModel(Glm4MoeModel):
1063010715 model_arch = gguf.MODEL_ARCH.GLM4_MOE
0 commit comments