@@ -1089,6 +1089,8 @@ def __init__(self, *args, **kwargs):
             raise TypeError("VisionModel must be subclassed with model_arch = gguf.MODEL_ARCH.CLIP_VISION")

         # get n_embd of the text model
+        if "text_config" not in self.hparams:
+            self.hparams["text_config"] = {}
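+        # merge top-level params with text_config so configs without a nested text_config still work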
         text_config = {**self.hparams, **self.hparams["text_config"]}
         self.n_embd_text = text_config.get("hidden_size", text_config.get("n_embd", 0))
         assert self.n_embd_text > 0, "n_embd not found in hparams"
@@ -2583,6 +2585,82 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         return [(self.map_tensor_name(name), data_torch)]


+@ModelBase.register("Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration")
+class Qwen2VLVisionModel(VisionModel):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.hparams["image_size"] = self.hparams.get("image_size", 560)
+        # rename config.json values
+        self.hparams["num_attention_heads"] = self.hparams.get("num_heads")
+        self.hparams["num_hidden_layers"] = self.hparams.get("depth")
+        if "embed_dim" in self.hparams:  # qwen2vl
+            self.hparams["intermediate_size"] = self.hparams.get("hidden_size")
+            self.hparams["hidden_size"] = self.hparams.get("embed_dim")
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        if self.global_config['model_type'] == 'qwen2_vl':
+            self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.QWEN2VL)
+        elif self.global_config['model_type'] == 'qwen2_5_vl':
+            self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.QWEN25VL)
+            self.gguf_writer.add_vision_use_silu(True)
+            # find n_wa_pattern (window attention pattern)
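+            # every n_wa_pattern-th block uses full attention and the rest use windowed attention,
+            # so the full-attention block indexes must be evenly spaced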
+            fullatt_block_indexes = hparams.get("fullatt_block_indexes")
+            assert fullatt_block_indexes is not None, "fullatt_block_indexes is required for qwen2_5_vl"
+            n_wa_pattern = fullatt_block_indexes[0] + 1
+            # validate n_wa_pattern
+            for i in range(1, len(fullatt_block_indexes)):
+                if fullatt_block_indexes[i] - fullatt_block_indexes[i - 1] != n_wa_pattern:
+                    raise ValueError(f"Invalid fullatt_block_indexes: {fullatt_block_indexes}")
+            self.gguf_writer.add_vision_n_wa_pattern(n_wa_pattern)
+        else:
+            raise ValueError(f"Unknown QwenVL model type: {self.global_config['model_type']}")
+        # default values below are taken from HF transformers code
+        self.gguf_writer.add_vision_attention_layernorm_eps(self.global_config.get("rms_norm_eps", 1e-6))
+
+    def tensor_force_quant(self, name, new_name, bid, n_dims):
+        del bid, name, n_dims  # unused
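+        # keep the patch and position embeddings in higher precision regardless of the requested quantization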
+        if ".patch_embd." in new_name:
+            return gguf.GGMLQuantizationType.F16
+        if ".position_embd." in new_name:
+            return gguf.GGMLQuantizationType.F32
+        return False
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+        if name.startswith("visual."):
+            # process visual tensors
+            # split QKV tensors if needed
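+            # the HF checkpoint fuses Q, K and V into one tensor; split it so it maps onto separate q/k/v GGUF tensors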
+            if ".qkv." in name:
+                if data_torch.ndim == 2:  # weight
+                    c3, _ = data_torch.shape
+                else:  # bias
+                    c3 = data_torch.shape[0]
+                assert c3 % 3 == 0
+                c = c3 // 3
+                wq = data_torch[:c]
+                wk = data_torch[c: c * 2]
+                wv = data_torch[c * 2:]
+                return [
+                    (self.map_tensor_name(name.replace("qkv", "q")), wq),
+                    (self.map_tensor_name(name.replace("qkv", "k")), wk),
+                    (self.map_tensor_name(name.replace("qkv", "v")), wv),
+                ]
+            elif 'patch_embed.proj.weight' in name:
+                # split Conv3D into Conv2Ds
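+                # each temporal slice of the Conv3D kernel is stored as its own 2D patch embedding weight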
+                c1, c2, kt, kh, kw = data_torch.shape
+                del c1, c2, kh, kw  # unused
+                assert kt == 2, "Current implementation only supports temporal_patch_size of 2"
+                return [
+                    (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight", data_torch[:, :, 0, ...]),
+                    (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight.1", data_torch[:, :, 1, ...]),
+                ]
+            else:
+                return [(self.map_tensor_name(name), data_torch)]
+        return []  # skip other tensors
+
+
 @ModelBase.register("WavTokenizerDec")
 class WavTokenizerDecModel(TextModel):
     model_arch = gguf.MODEL_ARCH.WAVTOKENIZER_DEC