diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 6358a94e9b55f..46631959f132d 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -2236,6 +2236,29 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
                 continue
             yield name, data
 
+# The Qwen2.5-VL text model matches the existing Qwen2-VL architecture; additional handling does not seem to be needed.
+@Model.register("Qwen2_5_VLForConditionalGeneration")
+class Qwen2_5_VLModel(Model):
+    model_arch = gguf.MODEL_ARCH.QWEN2VL
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        mrope_section = self.hparams["rope_scaling"]["mrope_section"]
+        mrope_section += [0] * max(0, 4 - len(mrope_section))
+        self.gguf_writer.add_rope_dimension_sections(mrope_section)
+
+    def set_vocab(self):
+        try:
+            self._set_vocab_sentencepiece()
+        except FileNotFoundError:
+            self._set_vocab_gpt2()
+
+    def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
+        for name, data in super().get_tensors():
+            if name.startswith("visual."):
+                continue
+            yield name, data
+
 
 @Model.register("WavTokenizerDec")
 class WavTokenizerDecModel(Model):
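For reference, the mrope_section handling in set_gguf_parameters only right-pads the list read from config.json with zeros to four entries, matching the four rope sections llama.cpp's M-RoPE implementation expects. A minimal sketch, assuming a hypothetical three-entry section like the ones Qwen2-VL-style configs ship:

mrope_section = [16, 24, 24]                            # hypothetical value from config.json
mrope_section += [0] * max(0, 4 - len(mrope_section))   # pad with zeros up to 4 entries
assert mrope_section == [16, 24, 24, 0]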
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 76d4a78520575..b45ad5622e840 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -9,25 +9,25 @@
 #include "ggml-backend.h"
 #include "gguf.h"
 
-//#ifdef GGML_USE_CUDA
-//#include "ggml-cuda.h"
-//#endif
-//
-//#ifdef GGML_USE_SYCL
-//#include "ggml-sycl.h"
-//#endif
-//
-//#ifdef GGML_USE_METAL
-//#include "ggml-metal.h"
-//#endif
-//
-//#ifdef GGML_USE_CANN
-//#include "ggml-cann.h"
-//#endif
-//
-//#ifdef GGML_USE_VULKAN
-//#include "ggml-vulkan.h"
-//#endif
+#ifdef GGML_USE_CUDA
+#include "ggml-cuda.h"
+#endif
+
+#ifdef GGML_USE_SYCL
+#include "ggml-sycl.h"
+#endif
+
+#ifdef GGML_USE_METAL
+#include "ggml-metal.h"
+#endif
+
+#ifdef GGML_USE_CANN
+#include "ggml-cann.h"
+#endif
+
+#ifdef GGML_USE_VULKAN
+#include "ggml-vulkan.h"
+#endif
 
 #define STB_IMAGE_IMPLEMENTATION
 #include "stb_image.h"
@@ -106,6 +106,8 @@ static std::string format(const char * fmt, ...) {
 #define KEY_HAS_GLM_PROJ "clip.has_glm_projector"
 #define KEY_MINICPMV_VERSION "clip.minicpmv_version"
 #define KEY_HAS_QWEN2VL_MERGER "clip.has_qwen2vl_merger"
+#define KEY_IS_QWEN2_5 "clip.is_qwen2_5"
+#define KEY_RMS_NORM_EPS "clip.%s.attention.rms_norm_epsilon"
 #define KEY_USE_GELU "clip.use_gelu"
 #define KEY_USE_SILU "clip.use_silu"
 #define KEY_N_EMBD "clip.%s.embedding_length"
@@ -583,6 +585,7 @@ struct clip_ctx {
     bool has_minicpmv_projector = false;
     bool has_glm_projector = false;
     bool has_qwen2vl_merger = false;
+    bool is_qwen2_5 = false;
     int minicpmv_version = 2;
 
     struct clip_vision_model vision_model;
@@ -734,7 +737,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     if (ctx->has_minicpmv_projector) {
         int pos_w = image_size_width/patch_size;
         int pos_h = image_size_height/patch_size;
-        if (ctx->minicpmv_version == 2) {
+        if (ctx->is_qwen2_5) {
+            pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 2048, pos_w * pos_h, 1);
+        }
+        else if (ctx->minicpmv_version == 2) {
             pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 4096, pos_w * pos_h, 1);
         }
         else if (ctx->minicpmv_version == 3) {
@@ -774,8 +780,14 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         {
             cur = ggml_norm(ctx0, cur, eps);
 
-            cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_1_w),
-                           model.layers[il].ln_1_b);
+            if (ctx->is_qwen2_5) {
+                // RMSNorm for Qwen2.5 (no bias)
+                cur = ggml_mul(ctx0, cur, model.layers[il].ln_1_w);
+            } else {
+                // Standard LayerNorm with bias
+                cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_1_w),
+                               model.layers[il].ln_1_b);
+            }
         }
 
         // self-attention
@@ -834,22 +846,47 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         {
             cur = ggml_norm(ctx0, cur, eps);
 
-            cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_2_w), model.layers[il].ln_2_b);
+            if (ctx->is_qwen2_5) {
+                // RMSNorm for Qwen2.5 (no bias)
+                cur = ggml_mul(ctx0, cur, model.layers[il].ln_2_w);
+            } else {
+                // Standard LayerNorm with bias
+                cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_2_w),
+                               model.layers[il].ln_2_b);
+            }
         }
 
-        cur = ggml_mul_mat(ctx0, model.layers[il].ff_i_w, cur);
-        cur = ggml_add(ctx0, cur, model.layers[il].ff_i_b);
+        // For Qwen2.5, the MLP uses SiLU gated activation
+        if (ctx->is_qwen2_5) {
+            // Qwen2.5 uses SiLU gated activation
+            // ffn_down is the gate_proj, ffn_up is the up_proj
+            struct ggml_tensor * gate = ggml_mul_mat(ctx0, model.layers[il].ff_i_w, cur);
+            struct ggml_tensor * up = ggml_mul_mat(ctx0, model.layers[il].ff_i_b, cur); // using ff_i_b as up_proj weight
+
+            // Apply SiLU to the gate
+            gate = ggml_silu_inplace(ctx0, gate);
+
+            // Multiply gate and up
+            cur = ggml_mul(ctx0, gate, up);
 
-        if (ctx->use_gelu) {
-            cur = ggml_gelu_inplace(ctx0, cur);
-        } else if (ctx->use_silu) {
-            cur = ggml_silu_inplace(ctx0, cur);
+            // Apply down projection
+            cur = ggml_mul_mat(ctx0, model.layers[il].ff_o_w, cur);
         } else {
-            cur = ggml_gelu_quick_inplace(ctx0, cur);
-        }
+            // Original MLP
+            cur = ggml_mul_mat(ctx0, model.layers[il].ff_i_w, cur);
+            cur = ggml_add(ctx0, cur, model.layers[il].ff_i_b);
+
+            if (ctx->use_gelu) {
+                cur = ggml_gelu_inplace(ctx0, cur);
+            } else if (ctx->use_silu) {
+                cur = ggml_silu_inplace(ctx0, cur);
+            } else {
+                cur = ggml_gelu_quick_inplace(ctx0, cur);
+            }
 
-        cur = ggml_mul_mat(ctx0, model.layers[il].ff_o_w, cur);
-        cur = ggml_add(ctx0, cur, model.layers[il].ff_o_b);
+            cur = ggml_mul_mat(ctx0, model.layers[il].ff_o_w, cur);
+            cur = ggml_add(ctx0, cur, model.layers[il].ff_o_b);
+        }
 
         // residual 2
         cur = ggml_add(ctx0, embeddings, cur);
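As a reference for what these new branches are intended to compute, the sketch below gives plain NumPy versions of RMSNorm (scale only, no bias or mean subtraction) versus LayerNorm, and of the SiLU-gated MLP used by Qwen2.5-VL's vision blocks. It is illustrative math only, not the ggml graph above, and the tensor sizes are hypothetical:

import numpy as np

def layer_norm(x, w, b, eps=1e-6):
    # LayerNorm: center, scale by the standard deviation, then apply weight and bias
    mu  = x.mean(-1, keepdims=True)
    var = x.var(-1, keepdims=True)
    return (x - mu) / np.sqrt(var + eps) * w + b

def rms_norm(x, w, eps=1e-6):
    # RMSNorm: no centering and no bias, only a learned per-channel scale
    rms = np.sqrt((x * x).mean(-1, keepdims=True) + eps)
    return x / rms * w

def silu(x):
    return x / (1.0 + np.exp(-x))

def gated_mlp(x, w_gate, w_up, w_down):
    # SiLU-gated MLP: silu(x @ W_gate) * (x @ W_up), followed by the down projection
    return (silu(x @ w_gate) * (x @ w_up)) @ w_down

# hypothetical sizes, just to exercise the functions
x      = np.random.randn(4, 1280).astype(np.float32)
w_gate = np.random.randn(1280, 3420).astype(np.float32)
w_up   = np.random.randn(1280, 3420).astype(np.float32)
w_down = np.random.randn(3420, 1280).astype(np.float32)
print(rms_norm(x, np.ones(1280, dtype=np.float32)).shape)  # (4, 1280)
print(gated_mlp(x, w_gate, w_up, w_down).shape)            # (4, 1280)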
@@ -1085,7 +1122,12 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         const int d_head = 128;
         int n_head = hidden_size/d_head;
         int num_query = 96;
-        if (ctx->minicpmv_version == 2) {
+        if (ctx->is_qwen2_5) {
+            hidden_size = 2048;
+            n_head = hidden_size/d_head;
+            num_query = 64;
+        }
+        else if (ctx->minicpmv_version == 2) {
             hidden_size = 4096;
             n_head = hidden_size/d_head;
             num_query = 96;
@@ -1296,30 +1338,30 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         }
     }
 
-//#ifdef GGML_USE_CUDA
-//    new_clip->backend = ggml_backend_cuda_init(0);
-//    LOG_INF("%s: CLIP using CUDA backend\n", __func__);
-//#endif
-//
-//#ifdef GGML_USE_METAL
-//    new_clip->backend = ggml_backend_metal_init();
-//    LOG_INF("%s: CLIP using Metal backend\n", __func__);
-//#endif
-//
-//#ifdef GGML_USE_CANN
-//    new_clip->backend = ggml_backend_cann_init(0);
-//    LOG_INF("%s: CLIP using CANN backend\n", __func__);
-//#endif
-//
-//#ifdef GGML_USE_VULKAN
-//    new_clip->backend = ggml_backend_vk_init(0);
-//    LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
-//#endif
-//
-//#ifdef GGML_USE_SYCL
-//    new_clip->backend = ggml_backend_sycl_init(0);
-//    LOG_INF("%s: CLIP using SYCL backend\n", __func__);
-//#endif
+#ifdef GGML_USE_CUDA
+    new_clip->backend = ggml_backend_cuda_init(0);
+    LOG_INF("%s: CLIP using CUDA backend\n", __func__);
+#endif
+
+#ifdef GGML_USE_METAL
+    new_clip->backend = ggml_backend_metal_init();
+    LOG_INF("%s: CLIP using Metal backend\n", __func__);
+#endif
+
+#ifdef GGML_USE_CANN
+    new_clip->backend = ggml_backend_cann_init(0);
+    LOG_INF("%s: CLIP using CANN backend\n", __func__);
+#endif
+
+#ifdef GGML_USE_VULKAN
+    new_clip->backend = ggml_backend_vk_init(0);
+    LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
+#endif
+
+#ifdef GGML_USE_SYCL
+    new_clip->backend = ggml_backend_sycl_init(0);
+    LOG_INF("%s: CLIP using SYCL backend\n", __func__);
+#endif
 
     if (!new_clip->backend) {
         new_clip->backend = ggml_backend_cpu_init();
@@ -1360,6 +1402,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         }
         // GGML_ASSERT(new_clip->has_llava_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search
 
+        idx = gguf_find_key(ctx, KEY_IS_QWEN2_5);
+        if (idx != -1) {
+            new_clip->is_qwen2_5 = gguf_get_val_bool(ctx, idx);
+        }
+
         GGML_ASSERT(new_clip->has_vision_encoder);
         GGML_ASSERT(!new_clip->has_text_encoder);
@@ -2942,7 +2989,10 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
         return ctx->vision_model.mm_3_b->ne[0];
     }
     if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
-        if (ctx->minicpmv_version == 2) {
+        if (ctx->is_qwen2_5) {
+            return 2048;
+        }
+        else if (ctx->minicpmv_version == 2) {
             return 4096;
         }
         else if (ctx->minicpmv_version == 3) {
@@ -2956,6 +3006,11 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
         return ctx->vision_model.mm_model_mlp_3_w->ne[1];
     }
     if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
+        // For Qwen2.5-VL-3B, the merger output dimension is 2048 (Qwen2-VL-7B uses 3584)
+        if (ctx->is_qwen2_5) {
+            LOG_INF("%s: Qwen2.5 detected, using output dimension 2048\n", __func__);
+            return 2048;
+        }
         return ctx->vision_model.mm_1_b->ne[0];
     }
 
@@ -2976,6 +3031,9 @@ bool clip_is_glm(const struct clip_ctx * ctx) {
 bool clip_is_qwen2vl(const struct clip_ctx * ctx) {
     return ctx->has_qwen2vl_merger;
 }
+bool clip_is_qwen2_5vl(const struct clip_ctx * ctx) {
+    return ctx->is_qwen2_5;
+}
 
 // Determine the number of encoder layers to iterate over
 int get_deepest_feature_layer(const struct clip_ctx * ctx) {
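The hard-coded 2048 above corresponds to the merger output / text hidden size of Qwen2.5-VL-3B-Instruct; the 7B checkpoint uses 3584, so other sizes would need the value taken from the GGUF metadata instead. A quick way to check the expected dimension against the Hugging Face config; this assumes a transformers build with Qwen2.5-VL support, where the vision config exposes out_hidden_size (verify the attribute name against your installed version):

from transformers import AutoConfig

cfg = AutoConfig.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct")
print(cfg.vision_config)                  # depth, hidden_size, num_heads, out_hidden_size, ...
print(cfg.vision_config.out_hidden_size)  # 2048 for the 3B model; what clip_n_mmproj_embd should return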
diff --git a/examples/llava/clip.h b/examples/llava/clip.h
index 002c419653a01..ba852b5bf0501 100644
--- a/examples/llava/clip.h
+++ b/examples/llava/clip.h
@@ -98,6 +98,7 @@ CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out
 CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx);
 CLIP_API bool clip_is_glm(const struct clip_ctx * ctx);
 CLIP_API bool clip_is_qwen2vl(const struct clip_ctx * ctx);
+CLIP_API bool clip_is_qwen2_5vl(const struct clip_ctx * ctx);
 
 CLIP_API int get_deepest_feature_layer(const struct clip_ctx * ctx);
 
diff --git a/examples/llava/qwen2_5_vl_surgery.py b/examples/llava/qwen2_5_vl_surgery.py
new file mode 100644
index 0000000000000..8d93df2af36f6
--- /dev/null
+++ b/examples/llava/qwen2_5_vl_surgery.py
@@ -0,0 +1,225 @@
+import argparse
+import os
+from typing import Dict
+
+import numpy as np
+import torch
+from gguf import *
+from transformers import (
+    Qwen2_5_VLForConditionalGeneration,
+    Qwen2_5_VLProcessor,
+)
+
+VISION = "clip.vision"
+
+
+def k(raw_key: str, arch: str) -> str:
+    return raw_key.format(arch=arch)
+
+
+def to_gguf_name(name: str) -> str:
+    og = name
+    name = name.replace("text_model", "t").replace("visual", "v")
+    name = name.replace("blocks", "blk").replace("embeddings.", "")
+    name = name.replace("attn.", "attn_")
+
+    # Handle the new Qwen2.5 MLP structure (gate/up/down projections)
+    if "mlp.gate_proj" in name:
+        name = name.replace("mlp.gate_proj", "ffn_gate")
+    elif "mlp.up_proj" in name:
+        name = name.replace("mlp.up_proj", "ffn_up")
+    elif "mlp.down_proj" in name:
+        name = name.replace("mlp.down_proj", "ffn_down")
+    else:
+        name = name.replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up")
+
+    name = name.replace("proj.", "out.")
+    name = name.replace("norm1", "ln1").replace("norm2", "ln2")
+    name = name.replace("merger.mlp", 'mm')
+
+    # For RMSNorm, which doesn't have bias
+    if "weight_g" in name:
+        name = name.replace("weight_g", "weight")
+
+    # Special handling for merger tensors to match clip.cpp expectations
+    if "merger.mlp" in name:
+        # Extract the layer number
+        parts = name.split(".")
+        for i, part in enumerate(parts):
+            if part == "mlp" and i + 1 < len(parts):
+                layer_num = parts[i + 1]
+                # Map the merger layers to the expected GGUF tensor names
+                # Note: clip.cpp looks for mm.0.* and mm.2.* (not mm.1.*)
+                if layer_num == "0":
+                    name = name.replace(f"merger.mlp.{layer_num}", "mm.0")
+                elif layer_num == "1":
+                    name = name.replace(f"merger.mlp.{layer_num}", "mm.2")
+                break
+
+    print(f"[to_gguf_name] {og} --> {name}")
+    return name
+
+
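A few example mappings, to make the renaming rules in to_gguf_name concrete; the input names are Qwen2.5-VL visual state-dict keys (fused qkv tensors are split by the caller before renaming, so they are not shown here), and the snippet can be run once the function above is defined:

examples = [
    "visual.blocks.0.norm1.weight",          # -> v.blk.0.ln1.weight
    "visual.blocks.0.attn.proj.weight",      # -> v.blk.0.attn_out.weight
    "visual.blocks.0.mlp.gate_proj.weight",  # -> v.blk.0.ffn_gate.weight
    "visual.blocks.0.mlp.up_proj.weight",    # -> v.blk.0.ffn_up.weight
    "visual.blocks.0.mlp.down_proj.weight",  # -> v.blk.0.ffn_down.weight
    "merger.mlp.2.weight",                   # -> mm.2.weight (second Linear of the merger)
]
for example in examples:
    to_gguf_name(example)  # prints the "[to_gguf_name] old --> new" line for each name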
+def find_vision_tensors(model, dtype, hidden_size) -> Dict[str, np.ndarray]:
+    visual = model.visual
+    tensor_map = {}
+
+    for name, ten in visual.state_dict().items():
+        ten = ten.numpy()
+        if 'qkv' in name:
+            if ten.ndim == 2:  # weight
+                c3, _ = ten.shape
+            else:  # bias
+                c3 = ten.shape[0]
+            assert c3 % 3 == 0
+            c = c3 // 3
+            wq = ten[:c]
+            wk = ten[c: c * 2]
+            wv = ten[c * 2:]
+            tensor_map[to_gguf_name(f"visual.{name}").replace("qkv", "q")] = wq
+            tensor_map[to_gguf_name(f"visual.{name}").replace("qkv", "k")] = wk
+            tensor_map[to_gguf_name(f"visual.{name}").replace("qkv", "v")] = wv
+        elif 'merger' in name:
+            if name.endswith("ln_q.weight") or name.endswith("ln_q.weight_g"):
+                tensor_map['v.post_ln.weight'] = ten
+            elif name.endswith("ln_q.bias"):
+                tensor_map['v.post_ln.bias'] = ten
+            else:
+                # Handle merger tensors with special attention to naming
+                # First, determine if this is a layer 0 or layer 1 tensor
+                if "merger.mlp.0" in name:
+                    # First layer gets mapped to mm.0.*
+                    if "weight" in name:
+                        tensor_map["mm.0.weight"] = ten
+                    elif "bias" in name:
+                        tensor_map["mm.0.bias"] = ten
+                elif "merger.mlp.1" in name:
+                    # Second layer gets mapped to mm.2.* (not mm.1.*)
+                    if "weight" in name:
+                        tensor_map["mm.2.weight"] = ten
+                    elif "bias" in name:
+                        tensor_map["mm.2.bias"] = ten
+                else:
+                    # For any other tensors, use the standard naming conversion
+                    tensor_map[to_gguf_name(name)] = ten
+        elif 'patch_embed.proj.weight' in name:
+            # NOTE: split Conv3D into Conv2Ds
+            c1, c2, kt, kh, kw = ten.shape
+            assert kt == 2, "Current implementation only supports temporal_patch_size of 2"
+            tensor_map["v.patch_embd.weight"] = ten[:, :, 0, ...]
+            tensor_map["v.patch_embd.weight.1"] = ten[:, :, 1, ...]
+        else:
+            tensor_map[to_gguf_name(f"visual.{name}")] = ten
+
+    for new_name, ten in tensor_map.items():
+        if ten.ndim <= 1 or new_name.endswith("_norm.weight"):
+            tensor_map[new_name] = ten.astype(np.float32)
+        else:
+            tensor_map[new_name] = ten.astype(dtype)
+
+    # For Qwen2.5, create a properly sized position embedding tensor
+    # Size it based on the model's hidden dimension and expected sequence length
+    seq_length = 40 * 40  # Approximate max sequence length
+    tensor_map["v.position_embd.weight"] = np.zeros([seq_length, hidden_size], dtype=np.float32)  # Properly sized placeholder
+    return tensor_map
+
+
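To make the qkv split and the Conv3D handling above concrete, here is a small NumPy sketch with toy shapes; for the real model, qkv.weight is roughly [3*1280, 1280] and patch_embed.proj.weight is [1280, 3, 2, 14, 14], but the numbers below are just placeholders:

import numpy as np

# The fused qkv.weight has shape [3*c, c]; the first, second and last thirds are Q, K and V.
c = 4
qkv = np.arange(3 * c * c, dtype=np.float32).reshape(3 * c, c)
wq, wk, wv = qkv[:c], qkv[c:2 * c], qkv[2 * c:]
assert wq.shape == wk.shape == wv.shape == (c, c)

# patch_embed.proj.weight is a Conv3D kernel [out, in, t, kh, kw] with t == 2; slicing the temporal
# axis gives the two Conv2D kernels stored as v.patch_embd.weight and v.patch_embd.weight.1.
conv3d = np.zeros((8, 3, 2, 14, 14), dtype=np.float32)
frame0, frame1 = conv3d[:, :, 0, ...], conv3d[:, :, 1, ...]
assert frame0.shape == frame1.shape == (8, 3, 14, 14)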
+def main(args):
+    if args.data_type == 'fp32':
+        dtype = torch.float32
+        np_dtype = np.float32
+        ftype = 0
+    elif args.data_type == 'fp16':
+        dtype = torch.float32
+        np_dtype = np.float16
+        ftype = 1
+    else:
+        raise ValueError()
+
+    local_model = False
+    model_path = ""
+    model_name = args.model_name
+    print("model_name: ", model_name)
+
+    # Load the model with the specific Qwen2.5 class
+    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+        model_name, torch_dtype=dtype, device_map="cpu"
+    )
+    cfg = model.config
+    vcfg = cfg.vision_config
+
+    if os.path.isdir(model_name):
+        local_model = True
+        if model_name.endswith(os.sep):
+            model_name = model_name[:-1]
+        model_path = model_name
+        model_name = os.path.basename(model_name)
+    fname_out = f"{model_name.replace('/', '-').lower()}-vision.gguf"
+
+    fout = GGUFWriter(path=fname_out, arch="clip")
+    fout.add_description("image encoder for Qwen2.5VL")
+
+    fout.add_file_type(ftype)
+    fout.add_bool("clip.has_text_encoder", False)
+    fout.add_bool("clip.has_vision_encoder", True)
+    fout.add_bool("clip.has_qwen2vl_merger", True)
+    fout.add_bool("clip.is_qwen2_5", True)  # Flag to identify Qwen2.5 models
+    fout.add_string("clip.projector_type", "qwen2vl_merger")
+
+    print(cfg.vision_config)
+    # SiLU activation
+    fout.add_bool("clip.use_silu", True)
+    fout.add_bool("clip.use_gelu", False)
+
+    # Add missing keys
+    # 1. mm_patch_merge_type - Qwen2.5 uses a flat merge type
+    fout.add_string("clip.vision.mm_patch_merge_type", "flat")
+
+    # 2. image_grid_pinpoints - For Qwen2.5, we'll provide standard resolution options
+    # These are common grid pinpoints for image processing, defining possible resolutions
+    grid_pinpoints = [224, 224, 336, 336, 448, 448, 560, 560]
+    fout.add_array("clip.vision.image_grid_pinpoints", grid_pinpoints)
+
+    # 3. feature_layer - Typically set to the last layer(s) for feature extraction
+    # For Qwen2.5, we'll use the final layer
+    feature_layers = [vcfg.depth]  # Use the last layer
+    fout.add_array("clip.vision.feature_layer", feature_layers)
+
+    # 4. image_crop_resolution - Set to the same as image_size by default
+    image_size = 14 * 40  # same as used below
+    fout.add_uint32("clip.vision.image_crop_resolution", image_size)
+
+    tensor_map = find_vision_tensors(model, np_dtype, vcfg.hidden_size)
+    for name, data in tensor_map.items():
+        fout.add_tensor(name, data)
+
+    fout.add_uint32("clip.vision.patch_size", vcfg.patch_size)
+    fout.add_uint32("clip.vision.image_size", image_size)  # reasonable size divisible by (14*2)
+    fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), vcfg.hidden_size)
+    fout.add_uint32("clip.vision.projection_dim", vcfg.hidden_size)
+    fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), vcfg.num_heads)
+    fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
+    fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), vcfg.depth)
+    fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), vcfg.intermediate_size)
+    fout.add_name(model_name)
+
+    # Load the processor using the specific Qwen2.5 processor class
+    if local_model:
+        processor = Qwen2_5_VLProcessor.from_pretrained(model_path)
+    else:
+        processor = Qwen2_5_VLProcessor.from_pretrained(model_name)
+
+    # Get the image mean and std values from the processor
+    fout.add_array("clip.vision.image_mean", processor.image_processor.image_mean)
+    fout.add_array("clip.vision.image_std", processor.image_processor.image_std)
+
+    fout.write_header_to_file()
+    fout.write_kv_data_to_file()
+    fout.write_tensors_to_file()
+    fout.close()
+    print("save model as: ", fname_out)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("model_name", nargs='?', default="Qwen/Qwen2.5-VL-3B-Instruct")
+    parser.add_argument("--data_type", nargs='?', choices=['fp32', 'fp16'], default="fp32")
+    args = parser.parse_args()
+    main(args)
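A quick way to sanity-check the file written by this script is to read it back with gguf-py and confirm that the clip.is_qwen2_5 flag and the mm.0.* / mm.2.* merger tensors are present. A minimal sketch; the filename is whatever the script printed, shown here for the default 3B model:

from gguf import GGUFReader

reader = GGUFReader("qwen-qwen2.5-vl-3b-instruct-vision.gguf")  # path printed by the script above

print("clip.is_qwen2_5" in reader.fields)  # the metadata flag that clip.cpp reads via KEY_IS_QWEN2_5
for tensor in reader.tensors:
    if tensor.name.startswith("mm.") or tensor.name.startswith("v.post_ln"):
        print(tensor.name, tensor.shape)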