
Commit 0891b07

qwen3vl fixed (+2 squashed commit)
Squashed commit: [89f65ed] wip fixing q3vl [6fa34cf] wip fixing q3vl
1 parent adec6eb · commit 0891b07

File tree

4 files changed: +25 additions, -15 deletions

gpttype_adapter.cpp

Lines changed: 1 addition & 1 deletion
@@ -2366,7 +2366,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     }
 
     llama_model * llamamodel = llama_model_load_from_file(kcpp_data->model_filename.c_str(), model_params);
-    if(file_format_meta.model_architecture == GGUFArch::ARCH_QWEN2VL || llama_model_rope_type(llamamodel)==LLAMA_ROPE_TYPE_MROPE)
+    if(file_format_meta.model_architecture == GGUFArch::ARCH_QWEN2VL || llama_model_rope_type(llamamodel)==LLAMA_ROPE_TYPE_MROPE || llama_model_rope_type(llamamodel)==LLAMA_ROPE_TYPE_IMROPE)
     {
         printf("\nMRope is used, context shift will be disabled!\n");
         kcpp_data->use_contextshift = false;
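
In plain terms, the condition now treats Qwen3-VL's interleaved M-RoPE (LLAMA_ROPE_TYPE_IMROPE) the same way as ordinary M-RoPE and the Qwen2-VL architecture: any of them switches context shift off, as the printf reports. Below is a minimal sketch of the same test factored into a predicate, using only identifiers that appear in the diff; the helper name and its bool parameter are hypothetical, not part of the commit.

#include "llama.h"

// Hypothetical restatement of the check above (not code from the commit):
// any M-RoPE variant, regular or interleaved, means context shift must be disabled.
static bool model_uses_mrope_family(const llama_model * model, bool is_qwen2vl_arch) {
    const llama_rope_type rt = llama_model_rope_type(model);
    return is_qwen2vl_arch
        || rt == LLAMA_ROPE_TYPE_MROPE    // Qwen2-VL style multi-axis rope
        || rt == LLAMA_ROPE_TYPE_IMROPE;  // interleaved variant added by this commit
}

With such a helper, the condition above would read: if (model_uses_mrope_family(llamamodel, file_format_meta.model_architecture == GGUFArch::ARCH_QWEN2VL)).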

otherarch/utils.cpp

Lines changed: 1 addition & 1 deletion
@@ -20,7 +20,7 @@
 #define MA_NO_NODE_GRAPH
 #define MA_NO_ENGINE
 #define MA_NO_GENERATION
-#define MA_API static
+// #define MA_API static
 #include "miniaudio/miniaudio.h"
 
 void utreplace(std::string & str, const std::string & needle, const std::string & replacement) {
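
Context for this one-line change: miniaudio lets a consumer override the MA_API macro before including the header, and defining it as static gives every miniaudio function internal linkage, so the copy compiled in this file cannot be referenced from any other translation unit. Commenting the define out restores the default external linkage, presumably so the implementation built here can also back tools/mtmd/mtmd-helper.cpp, which stops compiling its own copy below. A generic sketch of the macro pattern, with made-up names rather than miniaudio's real declarations:

/* mylib.h -- sketch of the single-header API-macro pattern (hypothetical library) */
#ifndef MYLIB_API
#define MYLIB_API extern   /* default: external linkage, callable from other translation units */
#endif

MYLIB_API int mylib_do_thing(int x);

/* A consumer that writes
 *     #define MYLIB_API static
 *     #include "mylib.h"
 * before the implementation gets a private copy: mylib_do_thing() is then invisible
 * to other files, which is what the removed "#define MA_API static" did for miniaudio. */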

tools/mtmd/llava.cpp

Lines changed: 21 additions & 11 deletions
@@ -229,21 +229,15 @@ static clip_image_f32 * reshape_by_patch(clip_image_f32 * image, int patch_size)
     return patch;
 }
 
-static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) {
-    // std::vector<clip_image_f32*> img_res_v; // format VectN x H x W x RGB (N x 336 x 336 x 3), so interleaved RGB - different to the python implementation which is N x 3 x 336 x 336
-    clip_image_f32_batch_ptr img_res_v(clip_image_f32_batch_init());
-    if (!clip_image_preprocess(ctx_clip, img, img_res_v.get())) {
-        LOG_ERR("%s: unable to preprocess image\n", __func__);
-        return false;
-    }
+static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, struct clip_image_f32_batch * preprocessed_img, float * image_embd, int * n_img_pos) {
 
     const int64_t t_img_enc_start_us = ggml_time_us();
 
     const char * mm_patch_merge_type = clip_patch_merge_type(ctx_clip);
 
-    const size_t n_imgs = clip_image_f32_batch_n_images(img_res_v.get());
+    const size_t n_imgs = clip_image_f32_batch_n_images(preprocessed_img);
 
-    clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), 0);
+    clip_image_f32 * img_res = clip_image_f32_get_img(preprocessed_img, 0);
     *n_img_pos = clip_n_output_tokens(ctx_clip, img_res);
     bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd); // image_embd shape is 576 x 4096
     if (!encoded) {
@@ -282,9 +276,25 @@ bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, co
         num_max_patches = 1;
     }
     float * image_embd;
+    clip_image_f32_batch_ptr preprocessed_img(clip_image_f32_batch_init());
+    if (!clip_image_preprocess(ctx_clip, img, preprocessed_img.get())) {
+        LOG_ERR("%s: unable to preprocess image\n", __func__);
+        return false;
+    }
+
     if (clip_is_qwen2vl(ctx_clip)) {
         // qwen2vl don't split image into chunks, so `num_max_patches` is not needed.
-        image_embd = (float *)malloc(clip_embd_nbytes_by_img(ctx_clip, img->nx, img->ny));
+        //sometimes they resize the image LARGER than before (padding up), so we must account for that
+        int max_nx = img->nx;
+        int max_ny = img->ny;
+        for(int i=0;i<preprocessed_img->entries.size();++i)
+        {
+            int a = preprocessed_img->entries[i].get()->nx;
+            int b = preprocessed_img->entries[i].get()->ny;
+            max_nx = std::max(max_nx,a);
+            max_ny = std::max(max_ny,b);
+        }
+        image_embd = (float *)malloc(clip_embd_nbytes_by_img(ctx_clip, max_nx, max_ny));
     } else {
         image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model
     }
@@ -294,7 +304,7 @@ bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, co
     }
 
     int n_img_pos;
-    if (!encode_image_with_clip(ctx_clip, n_threads, img, image_embd, &n_img_pos)) {
+    if (!encode_image_with_clip(ctx_clip, n_threads, preprocessed_img.get(), image_embd, &n_img_pos)) {
         LOG_ERR("%s: cannot encode image, aborting\n", __func__);
         free(image_embd);
         return false;
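
The substance of this file's change: preprocessing now happens once in llava_image_embed_make_with_clip_img, the resulting batch is handed to encode_image_with_clip, and the Qwen-VL embedding buffer is sized from what preprocessing actually produced. Since clip_image_preprocess can pad an image to dimensions larger than the input, sizing from img->nx and img->ny alone could under-allocate the buffer that clip_image_encode later writes into. A compressed restatement of the new sizing step, reusing the identifiers from the diff (ctx_clip, img, preprocessed_img, image_embd) and shortened with a range-for, so it is a sketch rather than the literal committed code:

// Size the buffer from the largest image produced by preprocessing,
// not from the raw input alone, since preprocessing may pad dimensions upward.
int max_nx = img->nx;
int max_ny = img->ny;
for (const auto & entry : preprocessed_img->entries) {   // entries = preprocessed images
    max_nx = std::max(max_nx, entry->nx);
    max_ny = std::max(max_ny, entry->ny);
}
image_embd = (float *)malloc(clip_embd_nbytes_by_img(ctx_clip, max_nx, max_ny));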

tools/mtmd/mtmd-helper.cpp

Lines changed: 2 additions & 2 deletions
@@ -17,7 +17,7 @@
 
 //#define MTMD_AUDIO_DEBUG
 
-#define MINIAUDIO_IMPLEMENTATION
+// #define MINIAUDIO_IMPLEMENTATION
 #ifndef MTMD_AUDIO_DEBUG
 # define MA_NO_ENCODING
 #endif
@@ -26,7 +26,7 @@
 #define MA_NO_NODE_GRAPH
 #define MA_NO_ENGINE
 #define MA_NO_GENERATION
-#define MA_API static
+// #define MA_API static
 #include "miniaudio/miniaudio.h"
 
 // #define STB_IMAGE_IMPLEMENTATION
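
Together with the otherarch/utils.cpp change above, this follows the usual single-header-library rule: exactly one translation unit defines MINIAUDIO_IMPLEMENTATION (with MA_API left at its default, external linkage), and every other file includes miniaudio.h only for declarations and links against that single copy, avoiding duplicate definitions. A minimal sketch of that layout, with hypothetical file names standing in for the real ones:

// audio_impl.cpp -- the one unit that compiles miniaudio's function bodies
// (in this commit that role presumably falls to otherarch/utils.cpp)
#define MINIAUDIO_IMPLEMENTATION
#include "miniaudio/miniaudio.h"

// audio_user.cpp -- any other unit, like mtmd-helper.cpp after this change:
// declarations only; the linker resolves the calls against audio_impl.cpp
#include "miniaudio/miniaudio.h"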
