qwen3vl fixed (+2 squashed commits)

LostRuins · LostRuins · commit 0891b0752dda · 2025-10-31T17:52:33.000+08:00
Squashed commits: [89f65ed] wip fixing q3vl; [6fa34cf] wip fixing q3vl
diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
@@ -2366,7 +2366,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
         }
 
         llama_model * llamamodel = llama_model_load_from_file(kcpp_data->model_filename.c_str(), model_params);
-        if(file_format_meta.model_architecture == GGUFArch::ARCH_QWEN2VL || llama_model_rope_type(llamamodel)==LLAMA_ROPE_TYPE_MROPE)
+        if(file_format_meta.model_architecture == GGUFArch::ARCH_QWEN2VL || llama_model_rope_type(llamamodel)==LLAMA_ROPE_TYPE_MROPE || llama_model_rope_type(llamamodel)==LLAMA_ROPE_TYPE_IMROPE)
         {
             printf("\nMRope is used, context shift will be disabled!\n");
             kcpp_data->use_contextshift = false;
diff --git a/otherarch/utils.cpp b/otherarch/utils.cpp
@@ -20,7 +20,7 @@
 #define MA_NO_NODE_GRAPH
 #define MA_NO_ENGINE
 #define MA_NO_GENERATION
-#define MA_API static
+// #define MA_API static
 #include "miniaudio/miniaudio.h"
 
 void utreplace(std::string & str, const std::string & needle, const std::string & replacement) {
diff --git a/tools/mtmd/llava.cpp b/tools/mtmd/llava.cpp
@@ -229,21 +229,15 @@ static clip_image_f32 * reshape_by_patch(clip_image_f32 * image, int patch_size)
     return patch;
 }
 
-static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) {
-    // std::vector<clip_image_f32*> img_res_v; // format VectN x H x W x RGB (N x 336 x 336 x 3), so interleaved RGB - different to the python implementation which is N x 3 x 336 x 336
-    clip_image_f32_batch_ptr img_res_v(clip_image_f32_batch_init());
-    if (!clip_image_preprocess(ctx_clip, img, img_res_v.get())) {
-        LOG_ERR("%s: unable to preprocess image\n", __func__);
-        return false;
-    }
+static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, struct clip_image_f32_batch * preprocessed_img, float * image_embd, int * n_img_pos) {
 
     const int64_t t_img_enc_start_us = ggml_time_us();
 
     const char * mm_patch_merge_type = clip_patch_merge_type(ctx_clip);
 
-    const size_t n_imgs = clip_image_f32_batch_n_images(img_res_v.get());
+    const size_t n_imgs = clip_image_f32_batch_n_images(preprocessed_img);
 
-    clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), 0);
+    clip_image_f32 * img_res = clip_image_f32_get_img(preprocessed_img, 0);
     *n_img_pos = clip_n_output_tokens(ctx_clip, img_res);
     bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd); // image_embd shape is 576 x 4096
     if (!encoded) {
@@ -282,9 +276,25 @@ bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, co
         num_max_patches = 1;
     }
     float * image_embd;
+    clip_image_f32_batch_ptr preprocessed_img(clip_image_f32_batch_init());
+    if (!clip_image_preprocess(ctx_clip, img, preprocessed_img.get())) {
+        LOG_ERR("%s: unable to preprocess image\n", __func__);
+        return false;
+    }
+
     if (clip_is_qwen2vl(ctx_clip)) {
         // qwen2vl don't split image into chunks, so `num_max_patches` is not needed.
-        image_embd = (float *)malloc(clip_embd_nbytes_by_img(ctx_clip, img->nx, img->ny));
+        //sometimes they resize the image LARGER than before (padding up), so we must account for that
+        int max_nx = img->nx;
+        int max_ny = img->ny;
+        for(int i=0;i<preprocessed_img->entries.size();++i)
+        {
+            int a = preprocessed_img->entries[i].get()->nx;
+            int b = preprocessed_img->entries[i].get()->ny;
+            max_nx = std::max(max_nx,a);
+            max_ny = std::max(max_ny,b);
+        }
+        image_embd = (float *)malloc(clip_embd_nbytes_by_img(ctx_clip, max_nx, max_ny));
     } else {
         image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model
     }
@@ -294,7 +304,7 @@ bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, co
     }
 
     int n_img_pos;
-    if (!encode_image_with_clip(ctx_clip, n_threads, img, image_embd, &n_img_pos)) {
+    if (!encode_image_with_clip(ctx_clip, n_threads, preprocessed_img.get(), image_embd, &n_img_pos)) {
         LOG_ERR("%s: cannot encode image, aborting\n", __func__);
         free(image_embd);
         return false;
diff --git a/tools/mtmd/mtmd-helper.cpp b/tools/mtmd/mtmd-helper.cpp
@@ -17,7 +17,7 @@
 
 //#define MTMD_AUDIO_DEBUG
 
-#define MINIAUDIO_IMPLEMENTATION
+// #define MINIAUDIO_IMPLEMENTATION
 #ifndef MTMD_AUDIO_DEBUG
 #   define MA_NO_ENCODING
 #endif
@@ -26,7 +26,7 @@
 #define MA_NO_NODE_GRAPH
 #define MA_NO_ENGINE
 #define MA_NO_GENERATION
-#define MA_API static
+// #define MA_API static
 #include "miniaudio/miniaudio.h"
 
 // #define STB_IMAGE_IMPLEMENTATION

Original file line number	Diff line number	Diff line change
`@@ -2366,7 +2366,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in`
`2366`	`2366`	`}`
`2367`	`2367`
`2368`	`2368`	`llama_model * llamamodel = llama_model_load_from_file(kcpp_data->model_filename.c_str(), model_params);`
`2369`		`- if(file_format_meta.model_architecture == GGUFArch::ARCH_QWEN2VL \|\| llama_model_rope_type(llamamodel)==LLAMA_ROPE_TYPE_MROPE)`
	`2369`	`+ if(file_format_meta.model_architecture == GGUFArch::ARCH_QWEN2VL \|\| llama_model_rope_type(llamamodel)==LLAMA_ROPE_TYPE_MROPE \|\| llama_model_rope_type(llamamodel)==LLAMA_ROPE_TYPE_IMROPE)`
`2370`	`2370`	`{`
`2371`	`2371`	`printf("\nMRope is used, context shift will be disabled!\n");`
`2372`	`2372`	`kcpp_data->use_contextshift = false;`