Skip to content

Commit 0537774

Browse files
authored
Merge pull request #33 from JJJYmmm/add_qwen3vl
qwen3vl - cleanup + test
2 parents (bd75d0f + 473ee90), commit 0537774

File tree

2 files changed

+18 / -45 lines changed (18 additions, 45 deletions)

2 files changed

+18 / -45 lines changed (18 additions, 45 deletions)

tools/mtmd/clip.cpp

Lines changed: 17 additions & 45 deletions
Original file line number | Diff line number | Diff line change
@@ -196,8 +196,6 @@ struct clip_hparams {
196196
int32_t n_wa_pattern = 0;
197197
int32_t spatial_merge_size = 0;
198198

199-
std::vector<bool> is_deepstack_layers; // qwen3vl: whether the layer is a deepstack layer
200-
201199
// audio
202200
int32_t n_mel_bins = 0; // whisper preprocessor
203201
int32_t proj_stack_factor = 0; // ultravox
@@ -251,6 +249,10 @@ struct clip_layer {
251249
ggml_tensor * deepstack_fc1_b = nullptr;
252250
ggml_tensor * deepstack_fc2_w = nullptr;
253251
ggml_tensor * deepstack_fc2_b = nullptr;
252+
253+
bool has_deepstack() const {
254+
return deepstack_fc1_w != nullptr;
255+
}
254256
};
255257

256258
struct clip_model {
@@ -270,6 +272,8 @@ struct clip_model {
270272

271273
std::vector<clip_layer> layers;
272274

275+
int32_t n_deepstack_layers = 0; // used by Qwen3-VL, calculated from clip_layer
276+
273277
ggml_tensor * post_ln_w;
274278
ggml_tensor * post_ln_b;
275279

@@ -983,7 +987,7 @@ struct clip_graph {
983987
cur = ggml_add(ctx0, inpL, cur);
984988
cb(cur, "layer_out", il);
985989

986-
if (hparams.is_deepstack_layers[il]) {
990+
if (layer.has_deepstack()) {
987991
ggml_tensor * feat = ggml_reshape_3d(ctx0, cur, n_embd * merge_factor, n_pos / merge_factor, batch_size);
988992
feat = build_norm(feat, layer.deepstack_norm_w, layer.deepstack_norm_b, norm_t, eps, il);
989993
feat = build_ffn(feat,
@@ -2565,9 +2569,6 @@ struct clip_model_loader {
25652569
hparams.vision_feature_layer.insert(layer);
25662570
}
25672571

2568-
// set default deepstack layers to false
2569-
hparams.is_deepstack_layers.resize(hparams.n_layer, false);
2570-
25712572
// model-specific params
25722573
switch (model.proj_type) {
25732574
case PROJECTOR_TYPE_MINICPMV:
@@ -2630,7 +2631,6 @@ struct clip_model_loader {
26302631
hparams.image_size = 1024; // still need this?
26312632
hparams.warmup_image_size = hparams.patch_size * 8;
26322633
get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.spatial_merge_size, false);
2633-
get_arr_bool(KEY_IS_DEEPSTACK_LAYERS, hparams.is_deepstack_layers, false);
26342634
} break;
26352635
case PROJECTOR_TYPE_LLAMA4:
26362636
{
@@ -2673,21 +2673,6 @@ struct clip_model_loader {
26732673
if (hparams.spatial_merge_size > 0) {
26742674
LOG_INF("%s: spatial_merge_size: %d\n", __func__, hparams.spatial_merge_size);
26752675
}
2676-
if (!hparams.is_deepstack_layers.empty()) {
2677-
LOG_INF("%s: deepstack enabled layers: ", __func__);
2678-
bool first = true;
2679-
for (size_t i = 0; i < hparams.is_deepstack_layers.size(); ++i) {
2680-
if (hparams.is_deepstack_layers[i]) {
2681-
LOG_CNT("%s%zu", first ? "" : ", ", i);
2682-
first = false;
2683-
}
2684-
}
2685-
if (first) {
2686-
LOG_CNT("none\n");
2687-
} else {
2688-
LOG_CNT("\n");
2689-
}
2690-
}
26912676
} else if (is_audio) {
26922677
LOG_INF("\n--- audio hparams ---\n");
26932678
LOG_INF("%s: n_mel_bins: %d\n", __func__, hparams.n_mel_bins);
@@ -2789,13 +2774,14 @@ struct clip_model_loader {
27892774

27902775

27912776
// qwen3vl deepstack layer
2792-
if (hparams.is_deepstack_layers[il]) {
2793-
layer.deepstack_norm_w = get_tensor(string_format(TN_DEEPSTACK_NORM, il, "weight"), false);
2794-
layer.deepstack_norm_b = get_tensor(string_format(TN_DEEPSTACK_NORM, il, "bias"), false);
2795-
layer.deepstack_fc1_w = get_tensor(string_format(TN_DEEPSTACK_FC1, il, "weight"), false);
2796-
layer.deepstack_fc1_b = get_tensor(string_format(TN_DEEPSTACK_FC1, il, "bias"), false);
2797-
layer.deepstack_fc2_w = get_tensor(string_format(TN_DEEPSTACK_FC2, il, "weight"), false);
2798-
layer.deepstack_fc2_b = get_tensor(string_format(TN_DEEPSTACK_FC2, il, "bias"), false);
2777+
layer.deepstack_norm_w = get_tensor(string_format(TN_DEEPSTACK_NORM, il, "weight"), false);
2778+
layer.deepstack_norm_b = get_tensor(string_format(TN_DEEPSTACK_NORM, il, "bias"), false);
2779+
layer.deepstack_fc1_w = get_tensor(string_format(TN_DEEPSTACK_FC1, il, "weight"), false);
2780+
layer.deepstack_fc1_b = get_tensor(string_format(TN_DEEPSTACK_FC1, il, "bias"), false);
2781+
layer.deepstack_fc2_w = get_tensor(string_format(TN_DEEPSTACK_FC2, il, "weight"), false);
2782+
layer.deepstack_fc2_b = get_tensor(string_format(TN_DEEPSTACK_FC2, il, "bias"), false);
2783+
if (layer.has_deepstack()) {
2784+
model.n_deepstack_layers++;
27992785
}
28002786

28012787
// some models already exported with legacy (incorrect) naming which is quite messy, let's fix it here
@@ -3155,21 +3141,6 @@ struct clip_model_loader {
31553141
}
31563142
}
31573143

3158-
void get_arr_bool(const std::string & key, std::vector<bool> & output, bool required = true) {
3159-
const int i = gguf_find_key(ctx_gguf.get(), key.c_str());
3160-
if (i < 0) {
3161-
if (required) throw std::runtime_error("Key not found: " + key);
3162-
return;
3163-
}
3164-
3165-
const int n = gguf_get_arr_n(ctx_gguf.get(), i);
3166-
output.resize(n);
3167-
const bool * values = (const bool *)gguf_get_arr_data(ctx_gguf.get(), i);
3168-
for (int i = 0; i < n; ++i) {
3169-
output[i] = values[i];
3170-
}
3171-
}
3172-
31733144
void set_llava_uhd_res_candidates(clip_model & model, const int max_patches_per_side) {
31743145
auto & hparams = model.hparams;
31753146
for (int x = 1; x <= max_patches_per_side; x++) {
@@ -4676,7 +4647,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
46764647
case PROJECTOR_TYPE_QWEN25VL:
46774648
return ctx->model.mm_1_b->ne[0];
46784649
case PROJECTOR_TYPE_QWEN3VL:
4679-
return ctx->model.mm_1_b->ne[0] * (1 + std::count(ctx->model.hparams.is_deepstack_layers.begin(), ctx->model.hparams.is_deepstack_layers.end(), true)); // main path + deepstack paths
4650+
// main path + deepstack paths
4651+
return ctx->model.mm_1_b->ne[0] * (1 + ctx->model.n_deepstack_layers);
46804652
case PROJECTOR_TYPE_GEMMA3:
46814653
return ctx->model.mm_input_proj_w->ne[0];
46824654
case PROJECTOR_TYPE_IDEFICS3:

tools/mtmd/tests.sh

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -84,6 +84,7 @@ if [ "$RUN_BIG_TESTS" = true ]; then
8484
add_test_vision "ggml-org/Qwen2-VL-7B-Instruct-GGUF:Q4_K_M"
8585
add_test_vision "ggml-org/Qwen2.5-VL-3B-Instruct-GGUF:Q4_K_M"
8686
add_test_vision "ggml-org/Qwen2.5-VL-7B-Instruct-GGUF:Q4_K_M"
87+
add_test_vision "ggml-org/Qwen3-VL-2B-Instruct-GGUF:Q8_0"
8788
add_test_vision "ggml-org/InternVL3-8B-Instruct-GGUF:Q4_K_M"
8889
add_test_vision "ggml-org/InternVL3-14B-Instruct-GGUF:Q4_K_M"
8990
add_test_vision "ggml-org/Qwen2.5-Omni-7B-GGUF:Q4_K_M"

0 commit comments

Comments (0)