@@ -3538,6 +3538,144 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
35383538 return super ().modify_tensors (data_torch , name , bid )
35393539
35403540
@ModelBase.register("Qwen3VLMoeForConditionalGeneration")
class Qwen3VLMoeVisionModel(MmprojModel):
    """Vision-tower (mmproj) converter for Qwen3-VL-MoE checkpoints.

    Normalizes the Qwen-style vision config keys onto the generic names
    MmprojModel expects, writes the CLIP/vision GGUF metadata, and maps the
    HF tensor names (deepstack mergers, merger MLP, fused QKV, Conv3D patch
    embedding) onto their GGUF counterparts.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        assert self.has_vision_encoder
        assert self.hparams_vision is not None

        # Derive image_size when the vision config omits it, from
        #   num_position_embeddings == (image_size / patch_size) ** 2
        if "image_size" not in self.hparams_vision:
            num_pos = self.hparams_vision.get("num_position_embeddings", 2304)
            patch_size = self.hparams_vision.get("patch_size", 16)
            import math
            # isqrt + check instead of int(sqrt(...)): a non-square value
            # would silently truncate and mis-size the position embedding.
            grid = math.isqrt(num_pos)
            if grid * grid != num_pos:
                raise ValueError(f"num_position_embeddings ({num_pos}) is not a perfect square")
            self.hparams_vision["image_size"] = grid * patch_size

        # Rename config values for compatibility. Only copy keys that are
        # actually present so we never overwrite a valid value with None
        # (`.get()` on a missing key would do exactly that).
        num_heads = self.hparams_vision.get("num_heads")
        if num_heads is not None:
            self.hparams_vision["num_attention_heads"] = num_heads
        depth = self.hparams_vision.get("depth")
        if depth is not None:
            self.hparams_vision["num_hidden_layers"] = depth

        # Indices of vision layers whose outputs feed the deepstack mergers.
        self.deepstack_layers: list[int] = list(self.hparams_vision.get("deepstack_visual_indexes", []))

    def set_gguf_parameters(self):
        """Write the Qwen3-VL vision hyperparameters into the GGUF header."""
        super().set_gguf_parameters()
        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN3VL)

        if self.hparams_vision is not None:
            merge_size = self.hparams_vision.get("spatial_merge_size")
            if merge_size is not None:
                self.gguf_writer.add_vision_spatial_merge_size(int(merge_size))

            hidden_act = (self.hparams_vision.get("hidden_act") or "").lower()
            if hidden_act:
                # Substring match covers gelu variants such as "gelu_pytorch_tanh".
                if "gelu" in hidden_act:
                    self.gguf_writer.add_vision_use_gelu(True)
                elif hidden_act == "silu":
                    self.gguf_writer.add_vision_use_silu(True)
                else:
                    raise ValueError(f"Unsupported hidden_act: {hidden_act}")

        # Use text config's rms_norm_eps for vision attention layernorm eps
        # (similar to qwen2vl); fall back to 1e-6 when neither location has it.
        rms_norm_eps = self.global_config.get("rms_norm_eps")
        if rms_norm_eps is None:
            text_config = self.global_config.get("text_config", {})
            rms_norm_eps = text_config.get("rms_norm_eps", 1e-6)
        self.gguf_writer.add_vision_attention_layernorm_eps(rms_norm_eps)

        if self.deepstack_layers:
            self.gguf_writer.add_vision_deepstack_layers(self.deepstack_layers)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """Map one HF checkpoint tensor to zero or more (gguf_name, tensor) pairs."""
        del bid  # unused

        # Some checkpoints nest the vision tower under "model."; strip it.
        if name.startswith("model.visual."):
            name = name.replace("model.visual.", "visual.", 1)

        # Deepstack mergers: "visual.deepstack_merger_list.{idx}.{target}"
        if name.startswith("visual.deepstack_merger_list."):
            idx_str, target = name.split(".", maxsplit=3)[2:]
            idx = int(idx_str)

            tensor_type: gguf.MODEL_TENSOR
            if target.startswith("norm."):
                tensor_type = gguf.MODEL_TENSOR.V_DS_NORM
                suffix = target.split(".", 1)[1]
            elif target.startswith("linear_fc1."):
                tensor_type = gguf.MODEL_TENSOR.V_DS_FC1
                suffix = target.split(".", 1)[1]
            elif target.startswith("linear_fc2."):
                tensor_type = gguf.MODEL_TENSOR.V_DS_FC2
                suffix = target.split(".", 1)[1]
            else:
                raise ValueError(f"Unexpected deepstack tensor: {name}")

            new_name = self.format_tensor_name(tensor_type, idx, suffix=f".{suffix}")
            return [(new_name, data_torch)]

        # Merger MLP: "visual.merger.{linear_fc1|linear_fc2|norm}.{weight|bias}"
        if name.startswith("visual.merger."):
            suffix = name.split(".", 2)[2]
            if suffix.startswith("linear_fc"):
                fc_idx_str, tail = suffix.split(".", 1)
                fc_num = int(fc_idx_str.replace("linear_fc", ""))
                # Qwen3VLMoe has linear_fc1 and linear_fc2.
                # Map to indices 0 and 2 (matching Qwen2VL which uses indices 0 and 2).
                if fc_num == 1:
                    fc_idx = 0
                elif fc_num == 2:
                    fc_idx = 2
                else:
                    raise ValueError(f"unexpected fc index {fc_num} in {name}")
                new_name = self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ, fc_idx, suffix=f".{tail}")
            elif suffix.startswith("norm."):
                new_name = self.format_tensor_name(gguf.MODEL_TENSOR.V_POST_NORM, suffix=f".{suffix.split('.', 1)[1]}")
            else:
                raise ValueError(f"Unexpected merger tensor: {name}")
            return [(new_name, data_torch)]

        if name == "visual.patch_embed.proj.weight":
            # Split the Conv3D kernel into two Conv2Ds along the temporal dim.
            c1, c2, kt, _, _ = data_torch.shape
            del c1, c2
            if kt != 2:
                raise ValueError("Current implementation only supports temporal_patch_size of 2")
            return [
                (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight", data_torch[:, :, 0, ...]),
                (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight.1", data_torch[:, :, 1, ...]),
            ]

        if name == "visual.patch_embed.proj.bias":
            # Skip bias for Qwen3VL - the C++ code expects it to be null
            return []

        if name.startswith("visual."):
            if ".qkv." in name:
                # Fused QKV: rows are [Q; K; V] stacked along dim 0, for both
                # 2-D weights and 1-D biases.
                if data_torch.ndim == 2:
                    c3, _ = data_torch.shape
                else:
                    c3 = data_torch.shape[0]
                if c3 % 3 != 0:
                    raise ValueError(f"Unexpected QKV shape for {name}: {data_torch.shape}")
                c = c3 // 3
                wq = data_torch[:c]
                wk = data_torch[c: c * 2]
                wv = data_torch[c * 2:]
                base = name.replace("qkv", "{placeholder}")
                return [
                    (self.map_tensor_name(base.format(placeholder="q")), wq),
                    (self.map_tensor_name(base.format(placeholder="k")), wk),
                    (self.map_tensor_name(base.format(placeholder="v")), wv),
                ]

            return [(self.map_tensor_name(name), data_torch)]

        # Not a vision tensor; the text-model converter handles it.
        return []
3678+
35413679@ModelBase .register ("InternVisionModel" )
35423680class InternVisionModel (MmprojModel ):
35433681 def set_gguf_parameters (self ):
@@ -3678,7 +3816,43 @@ def set_gguf_parameters(self):
36783816 def modify_tensors (self , data_torch : Tensor , name : str , bid : int | None ) -> Iterable [tuple [str , Tensor ]]:
36793817 # process the experts separately
36803818 name = name .replace ("language_model." , "" ) # InternVL
3681- if name .startswith ("mlp" ) or name .startswith ("vision_model" ) or name .startswith ("model.vision_tower" ) or name .startswith ("model.multi_modal_projector" ):
3819+
3820+ # handle aggregated expert tensors
3821+ # GGUF stores dimensions reversed from PyTorch, so:
3822+ # PyTorch (A,B,C) -> GGUF writes [C,B,A] -> GGML reads ne={C,B,A}
3823+ # Input shapes from HF: (n_expert, n_ff_exp, n_embd) or (n_expert, n_embd, n_ff_exp)
3824+ # Expected GGML ne: {n_embd, n_ff_exp, n_expert} for gate/up, {n_ff_exp, n_embd, n_expert} for down
3825+ if name .endswith ("mlp.experts.down_proj" ) or name .endswith ("mlp.experts.down_proj.weight" ):
3826+ mapped = f"{ name } .weight" if not name .endswith (".weight" ) else name
3827+ # Input: (n_expert=128, n_ff_exp=768, n_embd=2048)
3828+ # Want GGML ne: {n_ff_exp, n_embd, n_expert} = {768, 2048, 128}
3829+ # Need PyTorch: (128, 2048, 768) [reversed of GGML]
3830+ # So: permute(0, 2, 1): (128, 768, 2048) -> (128, 2048, 768)
3831+ permuted = data_torch .permute (0 , 2 , 1 ).contiguous ()
3832+ return [(self .map_tensor_name (mapped ), permuted )]
3833+
3834+ if name .endswith ("mlp.experts.gate_up_proj" ) or name .endswith ("mlp.experts.gate_up_proj.weight" ):
3835+ if data_torch .ndim < 3 or data_torch .shape [- 1 ] % 2 != 0 :
3836+ raise ValueError (f"Unexpected gate_up_proj shape for { name } : { tuple (data_torch .shape )} " )
3837+ split_dim = data_torch .shape [- 1 ] // 2
3838+ gate = data_torch [..., :split_dim ].contiguous ()
3839+ up = data_torch [..., split_dim :].contiguous ()
3840+ # Input gate/up: (n_expert=128, n_embd=2048, n_ff_exp=768)
3841+ # Want GGML ne: {n_embd, n_ff_exp, n_expert} = {2048, 768, 128}
3842+ # Need PyTorch: (128, 768, 2048) [reversed of GGML]
3843+ # So: permute(0, 2, 1): (128, 2048, 768) -> (128, 768, 2048)
3844+ base_name = name .removesuffix (".weight" )
3845+ base = base_name .rsplit ('.' , 1 )[0 ]
3846+ mapped_gate = f"{ base } .gate_proj.weight"
3847+ mapped_up = f"{ base } .up_proj.weight"
3848+ perm_gate = gate .permute (0 , 2 , 1 ).contiguous ()
3849+ perm_up = up .permute (0 , 2 , 1 ).contiguous ()
3850+ return [
3851+ (self .map_tensor_name (mapped_gate ), perm_gate ),
3852+ (self .map_tensor_name (mapped_up ), perm_up ),
3853+ ]
3854+
3855+ if name .startswith ("mlp" ) or name .startswith ("vision_model" ) or name .startswith ("model.vision_tower" ) or name .startswith ("model.multi_modal_projector" ) or name .startswith ("model.visual" ):
36823856 # skip visual tensors
36833857 return []
36843858 if name .find ("experts" ) != - 1 :
@@ -3826,6 +4000,28 @@ def set_vocab(self):
38264000 super ().set_vocab ()
38274001
38284002
@ModelBase.register("Qwen3VLMoeForConditionalGeneration")
class Qwen3VLMoeTextModel(Qwen3MoeModel):
    """Text-model converter for Qwen3-VL-MoE; adds MRoPE metadata on top of Qwen3MoE."""

    model_arch = gguf.MODEL_ARCH.QWEN3VLMOE

    def set_gguf_parameters(self):
        super().set_gguf_parameters()

        # Handle MRoPE (Multi-axis Rotary Position Embedding) for Qwen3-VL.
        # rope_scaling carries per-axis rotary dimension counts as
        # "mrope_section" = [time, height, width].
        text_config = self.hparams.get("text_config", {})
        rope_scaling = text_config.get("rope_scaling") or {}

        sections = rope_scaling.get("mrope_section")
        if sections:
            # Pad a *copy* to exactly 4 entries [time, height, width, extra];
            # padding the config list in place would mutate shared hparams state.
            padded = (list(sections) + [0, 0, 0, 0])[:4]
            self.gguf_writer.add_rope_dimension_sections(padded)
            # Keep the log inside the guard: `sections` is undefined otherwise.
            logger.info(f"MRoPE sections: {padded}")
4023+
4024+
38294025@ModelBase .register ("GPT2LMHeadModel" )
38304026class GPT2Model (TextModel ):
38314027 model_arch = gguf .MODEL_ARCH .GPT2
0 commit comments