diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 6358a94e9b55f..46631959f132d 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -2236,6 +2236,29 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
                 continue
             yield name, data
 
+# The Qwen2.5-VL text model matches the existing Qwen2-VL architecture; additional handling does not seem to be needed.
+@Model.register("Qwen2_5_VLForConditionalGeneration")
+class Qwen2_5_VLModel(Model):
+    model_arch = gguf.MODEL_ARCH.QWEN2VL
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        mrope_section = self.hparams["rope_scaling"]["mrope_section"]
+        mrope_section += [0] * max(0, 4 - len(mrope_section))
+        self.gguf_writer.add_rope_dimension_sections(mrope_section)
+
+    def set_vocab(self):
+        try:
+            self._set_vocab_sentencepiece()
+        except FileNotFoundError:
+            self._set_vocab_gpt2()
+
+    def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
+        for name, data in super().get_tensors():
+            if name.startswith("visual."):
+                continue
+            yield name, data
+
 
 @Model.register("WavTokenizerDec")
 class WavTokenizerDecModel(Model):
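For reference, the mrope_section handling in set_gguf_parameters only right-pads the list read from config.json with zeros to four entries, matching the four rope sections llama.cpp's M-RoPE implementation expects. A minimal sketch, assuming a hypothetical three-entry section like the ones Qwen2-VL-style configs ship:

mrope_section = [16, 24, 24]                            # hypothetical value from config.json
mrope_section += [0] * max(0, 4 - len(mrope_section))   # pad with zeros up to 4 entries
assert mrope_section == [16, 24, 24, 0]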
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 76d4a78520575..b45ad5622e840 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -9,25 +9,25 @@
 #include "ggml-backend.h"
 #include "gguf.h"
 
-//#ifdef GGML_USE_CUDA
-//#include "ggml-cuda.h"
-//#endif
-//
-//#ifdef GGML_USE_SYCL
-//#include "ggml-sycl.h"
-//#endif
-//
-//#ifdef GGML_USE_METAL
-//#include "ggml-metal.h"
-//#endif
-//
-//#ifdef GGML_USE_CANN
-//#include "ggml-cann.h"
-//#endif
-//
-//#ifdef GGML_USE_VULKAN
-//#include "ggml-vulkan.h"
-//#endif
+#ifdef GGML_USE_CUDA
+#include "ggml-cuda.h"
+#endif
+
+#ifdef GGML_USE_SYCL
+#include "ggml-sycl.h"
+#endif
+
+#ifdef GGML_USE_METAL
+#include "ggml-metal.h"
+#endif
+
+#ifdef GGML_USE_CANN
+#include "ggml-cann.h"
+#endif
+
+#ifdef GGML_USE_VULKAN
+#include "ggml-vulkan.h"
+#endif
 
 #define STB_IMAGE_IMPLEMENTATION
 #include "stb_image.h"
@@ -106,6 +106,8 @@ static std::string format(const char * fmt, ...) {
 #define KEY_HAS_GLM_PROJ "clip.has_glm_projector"
 #define KEY_MINICPMV_VERSION "clip.minicpmv_version"
 #define KEY_HAS_QWEN2VL_MERGER "clip.has_qwen2vl_merger"
+#define KEY_IS_QWEN2_5 "clip.is_qwen2_5"
+#define KEY_RMS_NORM_EPS "clip.%s.attention.rms_norm_epsilon"
 #define KEY_USE_GELU "clip.use_gelu"
 #define KEY_USE_SILU "clip.use_silu"
 #define KEY_N_EMBD "clip.%s.embedding_length"
@@ -583,6 +585,7 @@ struct clip_ctx {
     bool has_minicpmv_projector = false;
     bool has_glm_projector = false;
     bool has_qwen2vl_merger = false;
+    bool is_qwen2_5 = false;
     int minicpmv_version = 2;
 
     struct clip_vision_model vision_model;
@@ -734,7 +737,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     if (ctx->has_minicpmv_projector) {
         int pos_w = image_size_width/patch_size;
         int pos_h = image_size_height/patch_size;
-        if (ctx->minicpmv_version == 2) {
+        if (ctx->is_qwen2_5) {
+            pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 2048, pos_w * pos_h, 1);
+        }
+        else if (ctx->minicpmv_version == 2) {
             pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 4096, pos_w * pos_h, 1);
         }
         else if (ctx->minicpmv_version == 3) {
@@ -774,8 +780,14 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         {
             cur = ggml_norm(ctx0, cur, eps);
 
-            cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_1_w),
-                           model.layers[il].ln_1_b);
+            if (ctx->is_qwen2_5) {
+                // RMSNorm for Qwen2.5 (no bias)
+                cur = ggml_mul(ctx0, cur, model.layers[il].ln_1_w);
+            } else {
+                // Standard LayerNorm with bias
+                cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_1_w),
+                               model.layers[il].ln_1_b);
+            }
         }
 
         // self-attention
@@ -834,22 +846,47 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         {
             cur = ggml_norm(ctx0, cur, eps);
 
-            cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_2_w), model.layers[il].ln_2_b);
+            if (ctx->is_qwen2_5) {
+                // RMSNorm for Qwen2.5 (no bias)
+                cur = ggml_mul(ctx0, cur, model.layers[il].ln_2_w);
+            } else {
+                // Standard LayerNorm with bias
+                cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_2_w),
+                               model.layers[il].ln_2_b);
+            }
         }
 
-        cur = ggml_mul_mat(ctx0, model.layers[il].ff_i_w, cur);
-        cur = ggml_add(ctx0, cur, model.layers[il].ff_i_b);
+        // For Qwen2.5, the MLP uses SiLU gated activation
+        if (ctx->is_qwen2_5) {
+            // Qwen2.5 uses SiLU gated activation
+            // ffn_down is the gate_proj, ffn_up is the up_proj
+            struct ggml_tensor * gate = ggml_mul_mat(ctx0, model.layers[il].ff_i_w, cur);
+            struct ggml_tensor * up = ggml_mul_mat(ctx0, model.layers[il].ff_i_b, cur); // using ff_i_b as up_proj weight
+
+            // Apply SiLU to the gate
+            gate = ggml_silu_inplace(ctx0, gate);
+
+            // Multiply gate and up
+            cur = ggml_mul(ctx0, gate, up);
 
-        if (ctx->use_gelu) {
-            cur = ggml_gelu_inplace(ctx0, cur);
-        } else if (ctx->use_silu) {
-            cur = ggml_silu_inplace(ctx0, cur);
+            // Apply down projection
+            cur = ggml_mul_mat(ctx0, model.layers[il].ff_o_w, cur);
         } else {
-            cur = ggml_gelu_quick_inplace(ctx0, cur);
-        }
+            // Original MLP
+            cur = ggml_mul_mat(ctx0, model.layers[il].ff_i_w, cur);
+            cur = ggml_add(ctx0, cur, model.layers[il].ff_i_b);
+
+            if (ctx->use_gelu) {
+                cur = ggml_gelu_inplace(ctx0, cur);
+            } else if (ctx->use_silu) {
+                cur = ggml_silu_inplace(ctx0, cur);
+            } else {
+                cur = ggml_gelu_quick_inplace(ctx0, cur);
+            }
 
-        cur = ggml_mul_mat(ctx0, model.layers[il].ff_o_w, cur);
-        cur = ggml_add(ctx0, cur, model.layers[il].ff_o_b);
+            cur = ggml_mul_mat(ctx0, model.layers[il].ff_o_w, cur);
+            cur = ggml_add(ctx0, cur, model.layers[il].ff_o_b);
+        }
 
         // residual 2
         cur = ggml_add(ctx0, embeddings, cur);
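As a reference for what these new branches are intended to compute, the sketch below gives plain NumPy versions of RMSNorm (scale only, no bias or mean subtraction) versus LayerNorm, and of the SiLU-gated MLP used by Qwen2.5-VL's vision blocks. It is illustrative math only, not the ggml graph above, and the tensor sizes are hypothetical:

import numpy as np

def layer_norm(x, w, b, eps=1e-6):
    # LayerNorm: center, scale by the standard deviation, then apply weight and bias
    mu  = x.mean(-1, keepdims=True)
    var = x.var(-1, keepdims=True)
    return (x - mu) / np.sqrt(var + eps) * w + b

def rms_norm(x, w, eps=1e-6):
    # RMSNorm: no centering and no bias, only a learned per-channel scale
    rms = np.sqrt((x * x).mean(-1, keepdims=True) + eps)
    return x / rms * w

def silu(x):
    return x / (1.0 + np.exp(-x))

def gated_mlp(x, w_gate, w_up, w_down):
    # SiLU-gated MLP: silu(x @ W_gate) * (x @ W_up), followed by the down projection
    return (silu(x @ w_gate) * (x @ w_up)) @ w_down

# hypothetical sizes, just to exercise the functions
x      = np.random.randn(4, 1280).astype(np.float32)
w_gate = np.random.randn(1280, 3420).astype(np.float32)
w_up   = np.random.randn(1280, 3420).astype(np.float32)
w_down = np.random.randn(3420, 1280).astype(np.float32)
print(rms_norm(x, np.ones(1280, dtype=np.float32)).shape)  # (4, 1280)
print(gated_mlp(x, w_gate, w_up, w_down).shape)            # (4, 1280)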
@@ -1085,7 +1122,12 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         const int d_head = 128;
         int n_head = hidden_size/d_head;
         int num_query = 96;
-        if (ctx->minicpmv_version == 2) {
+        if (ctx->is_qwen2_5) {
+            hidden_size = 2048;
+            n_head = hidden_size/d_head;
+            num_query = 64;
+        }
+        else if (ctx->minicpmv_version == 2) {
             hidden_size = 4096;
             n_head = hidden_size/d_head;
             num_query = 96;
@@ -1296,30 +1338,30 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         }
     }
 
-//#ifdef GGML_USE_CUDA
-//    new_clip->backend = ggml_backend_cuda_init(0);
-//    LOG_INF("%s: CLIP using CUDA backend\n", __func__);
-//#endif
-//
-//#ifdef GGML_USE_METAL
-//    new_clip->backend = ggml_backend_metal_init();
-//    LOG_INF("%s: CLIP using Metal backend\n", __func__);
-//#endif
-//
-//#ifdef GGML_USE_CANN
-//    new_clip->backend = ggml_backend_cann_init(0);
-//    LOG_INF("%s: CLIP using CANN backend\n", __func__);
-//#endif
-//
-//#ifdef GGML_USE_VULKAN
-//    new_clip->backend = ggml_backend_vk_init(0);
-//    LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
-//#endif
-//
-//#ifdef GGML_USE_SYCL
-//    new_clip->backend = ggml_backend_sycl_init(0);
-//    LOG_INF("%s: CLIP using SYCL backend\n", __func__);
-//#endif
+#ifdef GGML_USE_CUDA
+    new_clip->backend = ggml_backend_cuda_init(0);
+    LOG_INF("%s: CLIP using CUDA backend\n", __func__);
+#endif
+
+#ifdef GGML_USE_METAL
+    new_clip->backend = ggml_backend_metal_init();
+    LOG_INF("%s: CLIP using Metal backend\n", __func__);
+#endif
+
+#ifdef GGML_USE_CANN
+    new_clip->backend = ggml_backend_cann_init(0);
+    LOG_INF("%s: CLIP using CANN backend\n", __func__);
+#endif
+
+#ifdef GGML_USE_VULKAN
+    new_clip->backend = ggml_backend_vk_init(0);
+    LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
+#endif
+
+#ifdef GGML_USE_SYCL
+    new_clip->backend = ggml_backend_sycl_init(0);
+    LOG_INF("%s: CLIP using SYCL backend\n", __func__);
+#endif
 
     if (!new_clip->backend) {
         new_clip->backend = ggml_backend_cpu_init();
@@ -1360,6 +1402,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         }
         // GGML_ASSERT(new_clip->has_llava_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search
 
+        idx = gguf_find_key(ctx, KEY_IS_QWEN2_5);
+        if (idx != -1) {
+            new_clip->is_qwen2_5 = gguf_get_val_bool(ctx, idx);
+        }
+
         GGML_ASSERT(new_clip->has_vision_encoder);
         GGML_ASSERT(!new_clip->has_text_encoder);
@@ -2942,7 +2989,10 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
         return ctx->vision_model.mm_3_b->ne[0];
     }
     if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
-        if (ctx->minicpmv_version == 2) {
+        if (ctx->is_qwen2_5) {
+            return 2048;
+        }
+        else if (ctx->minicpmv_version == 2) {
             return 4096;
         }
         else if (ctx->minicpmv_version == 3) {
@@ -2956,6 +3006,11 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
         return ctx->vision_model.mm_model_mlp_3_w->ne[1];
     }
     if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
+        // For Qwen2.5-VL-3B, the merger output dimension is 2048 (Qwen2-VL-7B uses 3584)
+        if (ctx->is_qwen2_5) {
+            LOG_INF("%s: Qwen2.5 detected, using output dimension 2048\n", __func__);
+            return 2048;
+        }
         return ctx->vision_model.mm_1_b->ne[0];
     }
 
@@ -2976,6 +3031,9 @@ bool clip_is_glm(const struct clip_ctx * ctx) {
 bool clip_is_qwen2vl(const struct clip_ctx * ctx) {
     return ctx->has_qwen2vl_merger;
 }
+bool clip_is_qwen2_5vl(const struct clip_ctx * ctx) {
+    return ctx->is_qwen2_5;
+}
 
 // Determine the number of encoder layers to iterate over
 int get_deepest_feature_layer(const struct clip_ctx * ctx) {
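The hard-coded 2048 above corresponds to the merger output / text hidden size of Qwen2.5-VL-3B-Instruct; the 7B checkpoint uses 3584, so other sizes would need the value taken from the GGUF metadata instead. A quick way to check the expected dimension against the Hugging Face config; this assumes a transformers build with Qwen2.5-VL support, where the vision config exposes out_hidden_size (verify the attribute name against your installed version):

from transformers import AutoConfig

cfg = AutoConfig.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct")
print(cfg.vision_config)                  # depth, hidden_size, num_heads, out_hidden_size, ...
print(cfg.vision_config.out_hidden_size)  # 2048 for the 3B model; what clip_n_mmproj_embd should return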
diff --git a/examples/llava/clip.h b/examples/llava/clip.h
index 002c419653a01..ba852b5bf0501 100644
--- a/examples/llava/clip.h
+++ b/examples/llava/clip.h
@@ -98,6 +98,7 @@ CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out
 CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx);
 CLIP_API bool clip_is_glm(const struct clip_ctx * ctx);
 CLIP_API bool clip_is_qwen2vl(const struct clip_ctx * ctx);
+CLIP_API bool clip_is_qwen2_5vl(const struct clip_ctx * ctx);
 
 CLIP_API int get_deepest_feature_layer(const struct clip_ctx * ctx);
 
diff --git a/examples/llava/qwen2_5_vl_surgery.py b/examples/llava/qwen2_5_vl_surgery.py
new file mode 100644
index 0000000000000..8d93df2af36f6
--- /dev/null
+++ b/examples/llava/qwen2_5_vl_surgery.py
@@ -0,0 +1,225 @@
+import argparse
+import os
+from typing import Dict
+
+import numpy as np
+import torch
+from gguf import *
+from transformers import (
+    Qwen2_5_VLForConditionalGeneration,
+    Qwen2_5_VLProcessor,
+)
+
+VISION = "clip.vision"
+
+
+def k(raw_key: str, arch: str) -> str:
+    return raw_key.format(arch=arch)
+
+
+def to_gguf_name(name: str) -> str:
+    og = name
+    name = name.replace("text_model", "t").replace("visual", "v")
+    name = name.replace("blocks", "blk").replace("embeddings.", "")
+    name = name.replace("attn.", "attn_")
+
+    # Handle the new Qwen2.5 MLP structure (gate/up/down projections)
+    if "mlp.gate_proj" in name:
+        name = name.replace("mlp.gate_proj", "ffn_gate")
+    elif "mlp.up_proj" in name:
+        name = name.replace("mlp.up_proj", "ffn_up")
+    elif "mlp.down_proj" in name:
+        name = name.replace("mlp.down_proj", "ffn_down")
+    else:
+        name = name.replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up")
+
+    name = name.replace("proj.", "out.")
+    name = name.replace("norm1", "ln1").replace("norm2", "ln2")
+    name = name.replace("merger.mlp", 'mm')
+
+    # For RMSNorm, which doesn't have bias
+    if "weight_g" in name:
+        name = name.replace("weight_g", "weight")
+
+    # Special handling for merger tensors to match clip.cpp expectations
+    if "merger.mlp" in name:
+        # Extract the layer number
+        parts = name.split(".")
+        for i, part in enumerate(parts):
+            if part == "mlp" and i + 1 < len(parts):
+                layer_num = parts[i + 1]
+                # Map the merger layers to the expected GGUF tensor names
+                # Note: clip.cpp looks for mm.0.* and mm.2.* (not mm.1.*)
+                if layer_num == "0":
+                    name = name.replace(f"merger.mlp.{layer_num}", "mm.0")
+                elif layer_num == "1":
+                    name = name.replace(f"merger.mlp.{layer_num}", "mm.2")
+                break
+
+    print(f"[to_gguf_name] {og} --> {name}")
+    return name
+
+
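A few example mappings, to make the renaming rules in to_gguf_name concrete; the input names are Qwen2.5-VL visual state-dict keys (fused qkv tensors are split by the caller before renaming, so they are not shown here), and the snippet can be run once the function above is defined:

examples = [
    "visual.blocks.0.norm1.weight",          # -> v.blk.0.ln1.weight
    "visual.blocks.0.attn.proj.weight",      # -> v.blk.0.attn_out.weight
    "visual.blocks.0.mlp.gate_proj.weight",  # -> v.blk.0.ffn_gate.weight
    "visual.blocks.0.mlp.up_proj.weight",    # -> v.blk.0.ffn_up.weight
    "visual.blocks.0.mlp.down_proj.weight",  # -> v.blk.0.ffn_down.weight
    "merger.mlp.2.weight",                   # -> mm.2.weight (second Linear of the merger)
]
for example in examples:
    to_gguf_name(example)  # prints the "[to_gguf_name] old --> new" line for each name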
+def find_vision_tensors(model, dtype, hidden_size) -> Dict[str, np.ndarray]:
+    visual = model.visual
+    tensor_map = {}
+
+    for name, ten in visual.state_dict().items():
+        ten = ten.numpy()
+        if 'qkv' in name:
+            if ten.ndim == 2:  # weight
+                c3, _ = ten.shape
+            else:  # bias
+                c3 = ten.shape[0]
+            assert c3 % 3 == 0
+            c = c3 // 3
+            wq = ten[:c]
+            wk = ten[c: c * 2]
+            wv = ten[c * 2:]
+            tensor_map[to_gguf_name(f"visual.{name}").replace("qkv", "q")] = wq
+            tensor_map[to_gguf_name(f"visual.{name}").replace("qkv", "k")] = wk
+            tensor_map[to_gguf_name(f"visual.{name}").replace("qkv", "v")] = wv
+        elif 'merger' in name:
+            if name.endswith("ln_q.weight") or name.endswith("ln_q.weight_g"):
+                tensor_map['v.post_ln.weight'] = ten
+            elif name.endswith("ln_q.bias"):
+                tensor_map['v.post_ln.bias'] = ten
+            else:
+                # Handle merger tensors with special attention to naming
+                # First, determine if this is a layer 0 or layer 1 tensor
+                if "merger.mlp.0" in name:
+                    # First layer gets mapped to mm.0.*
+                    if "weight" in name:
+                        tensor_map["mm.0.weight"] = ten
+                    elif "bias" in name:
+                        tensor_map["mm.0.bias"] = ten
+                elif "merger.mlp.1" in name:
+                    # Second layer gets mapped to mm.2.* (not mm.1.*)
+                    if "weight" in name:
+                        tensor_map["mm.2.weight"] = ten
+                    elif "bias" in name:
+                        tensor_map["mm.2.bias"] = ten
+                else:
+                    # For any other tensors, use the standard naming conversion
+                    tensor_map[to_gguf_name(name)] = ten
+        elif 'patch_embed.proj.weight' in name:
+            # NOTE: split Conv3D into Conv2Ds
+            c1, c2, kt, kh, kw = ten.shape
+            assert kt == 2, "Current implementation only supports temporal_patch_size of 2"
+            tensor_map["v.patch_embd.weight"] = ten[:, :, 0, ...]
+            tensor_map["v.patch_embd.weight.1"] = ten[:, :, 1, ...]
+        else:
+            tensor_map[to_gguf_name(f"visual.{name}")] = ten
+
+    for new_name, ten in tensor_map.items():
+        if ten.ndim <= 1 or new_name.endswith("_norm.weight"):
+            tensor_map[new_name] = ten.astype(np.float32)
+        else:
+            tensor_map[new_name] = ten.astype(dtype)
+
+    # For Qwen2.5, create a properly sized position embedding tensor
+    # Size it based on the model's hidden dimension and expected sequence length
+    seq_length = 40 * 40  # Approximate max sequence length
+    tensor_map["v.position_embd.weight"] = np.zeros([seq_length, hidden_size], dtype=np.float32)  # Properly sized placeholder
+    return tensor_map
+
+
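To make the qkv split and the Conv3D handling above concrete, here is a small NumPy sketch with toy shapes; for the real model, qkv.weight is roughly [3*1280, 1280] and patch_embed.proj.weight is [1280, 3, 2, 14, 14], but the numbers below are just placeholders:

import numpy as np

# The fused qkv.weight has shape [3*c, c]; the first, second and last thirds are Q, K and V.
c = 4
qkv = np.arange(3 * c * c, dtype=np.float32).reshape(3 * c, c)
wq, wk, wv = qkv[:c], qkv[c:2 * c], qkv[2 * c:]
assert wq.shape == wk.shape == wv.shape == (c, c)

# patch_embed.proj.weight is a Conv3D kernel [out, in, t, kh, kw] with t == 2; slicing the temporal
# axis gives the two Conv2D kernels stored as v.patch_embd.weight and v.patch_embd.weight.1.
conv3d = np.zeros((8, 3, 2, 14, 14), dtype=np.float32)
frame0, frame1 = conv3d[:, :, 0, ...], conv3d[:, :, 1, ...]
assert frame0.shape == frame1.shape == (8, 3, 14, 14)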
+def main(args):
+    if args.data_type == 'fp32':
+        dtype = torch.float32
+        np_dtype = np.float32
+        ftype = 0
+    elif args.data_type == 'fp16':
+        dtype = torch.float32
+        np_dtype = np.float16
+        ftype = 1
+    else:
+        raise ValueError()
+
+    local_model = False
+    model_path = ""
+    model_name = args.model_name
+    print("model_name: ", model_name)
+
+    # Load the model with the specific Qwen2.5 class
+    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+        model_name, torch_dtype=dtype, device_map="cpu"
+    )
+    cfg = model.config
+    vcfg = cfg.vision_config
+
+    if os.path.isdir(model_name):
+        local_model = True
+        if model_name.endswith(os.sep):
+            model_name = model_name[:-1]
+        model_path = model_name
+        model_name = os.path.basename(model_name)
+    fname_out = f"{model_name.replace('/', '-').lower()}-vision.gguf"
+
+    fout = GGUFWriter(path=fname_out, arch="clip")
+    fout.add_description("image encoder for Qwen2.5VL")
+
+    fout.add_file_type(ftype)
+    fout.add_bool("clip.has_text_encoder", False)
+    fout.add_bool("clip.has_vision_encoder", True)
+    fout.add_bool("clip.has_qwen2vl_merger", True)
+    fout.add_bool("clip.is_qwen2_5", True)  # Flag to identify Qwen2.5 models
+    fout.add_string("clip.projector_type", "qwen2vl_merger")
+
+    print(cfg.vision_config)
+    # SiLU activation
+    fout.add_bool("clip.use_silu", True)
+    fout.add_bool("clip.use_gelu", False)
+
+    # Add missing keys
+    # 1. mm_patch_merge_type - Qwen2.5 uses a flat merge type
+    fout.add_string("clip.vision.mm_patch_merge_type", "flat")
+
+    # 2. image_grid_pinpoints - For Qwen2.5, we'll provide standard resolution options
+    # These are common grid pinpoints for image processing, defining possible resolutions
+    grid_pinpoints = [224, 224, 336, 336, 448, 448, 560, 560]
+    fout.add_array("clip.vision.image_grid_pinpoints", grid_pinpoints)
+
+    # 3. feature_layer - Typically set to the last layer(s) for feature extraction
+    # For Qwen2.5, we'll use the final layer
+    feature_layers = [vcfg.depth]  # Use the last layer
+    fout.add_array("clip.vision.feature_layer", feature_layers)
+
+    # 4. image_crop_resolution - Set to the same as image_size by default
+    image_size = 14 * 40  # same as used below
+    fout.add_uint32("clip.vision.image_crop_resolution", image_size)
+
+    tensor_map = find_vision_tensors(model, np_dtype, vcfg.hidden_size)
+    for name, data in tensor_map.items():
+        fout.add_tensor(name, data)
+
+    fout.add_uint32("clip.vision.patch_size", vcfg.patch_size)
+    fout.add_uint32("clip.vision.image_size", image_size)  # reasonable size divisible by (14*2)
+    fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), vcfg.hidden_size)
+    fout.add_uint32("clip.vision.projection_dim", vcfg.hidden_size)
+    fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), vcfg.num_heads)
+    fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
+    fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), vcfg.depth)
+    fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), vcfg.intermediate_size)
+    fout.add_name(model_name)
+
+    # Load the processor using the specific Qwen2.5 processor class
+    if local_model:
+        processor = Qwen2_5_VLProcessor.from_pretrained(model_path)
+    else:
+        processor = Qwen2_5_VLProcessor.from_pretrained(model_name)
+
+    # Get the image mean and std values from the processor
+    fout.add_array("clip.vision.image_mean", processor.image_processor.image_mean)
+    fout.add_array("clip.vision.image_std", processor.image_processor.image_std)
+
+    fout.write_header_to_file()
+    fout.write_kv_data_to_file()
+    fout.write_tensors_to_file()
+    fout.close()
+    print("save model as: ", fname_out)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("model_name", nargs='?', default="Qwen/Qwen2.5-VL-3B-Instruct")
+    parser.add_argument("--data_type", nargs='?', choices=['fp32', 'fp16'], default="fp32")
+    args = parser.parse_args()
+    main(args)
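A quick way to sanity-check the file written by this script is to read it back with gguf-py and confirm that the clip.is_qwen2_5 flag and the mm.0.* / mm.2.* merger tensors are present. A minimal sketch; the filename is whatever the script printed, shown here for the default 3B model:

from gguf import GGUFReader

reader = GGUFReader("qwen-qwen2.5-vl-3b-instruct-vision.gguf")  # path printed by the script above

print("clip.is_qwen2_5" in reader.fields)  # the metadata flag that clip.cpp reads via KEY_IS_QWEN2_5
for tensor in reader.tensors:
    if tensor.name.startswith("mm.") or tensor.name.startswith("v.post_ln"):
        print(tensor.name, tensor.shape)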