@@ -254,7 +254,9 @@ struct clip_vision_model {
     ggml_tensor * post_ln_w;
     ggml_tensor * post_ln_b;
 
-    ggml_tensor * projection;
+    ggml_tensor * projection; // TODO: rename it to fc (fully connected layer)
+    ggml_tensor * mm_fc_w;
+    ggml_tensor * mm_fc_b;
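+    // mm_fc_* is the single fully connected audio projector used by PROJECTOR_TYPE_QWEN2A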
 
     // LLaVA projection
     ggml_tensor * mm_input_norm_w = nullptr;
@@ -1471,48 +1473,58 @@ struct clip_graph {
 
         cb(cur, "after_transformer", -1);
 
-        // StackAudioFrames
-        // https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/blob/main/ultravox_model.py
-        {
-            int64_t stride = n_embd * hparams.proj_stack_factor;
-            int64_t padded_len = GGML_PAD(ggml_nelements(cur), stride);
-            int64_t pad = padded_len - ggml_nelements(cur);
-            if (pad > 0) {
-                cur = ggml_view_1d(ctx0, cur, ggml_nelements(cur), 0);
-                cur = ggml_pad(ctx0, cur, pad, 0, 0, 0);
+        if (ctx->proj_type == PROJECTOR_TYPE_ULTRAVOX) {
+            // StackAudioFrames
+            // https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/blob/main/ultravox_model.py
+            {
+                int64_t stride = n_embd * hparams.proj_stack_factor;
+                int64_t padded_len = GGML_PAD(ggml_nelements(cur), stride);
+                int64_t pad = padded_len - ggml_nelements(cur);
+                if (pad > 0) {
+                    cur = ggml_view_1d(ctx0, cur, ggml_nelements(cur), 0);
+                    cur = ggml_pad(ctx0, cur, pad, 0, 0, 0);
+                }
+                cur = ggml_view_2d(ctx0, cur, stride, padded_len / stride,
+                                   ggml_row_size(cur->type, stride), 0);
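+                // e.g. with hypothetical n_embd = 1280 and proj_stack_factor = 8, stride = 10240,
+                // so each row of this 2D view packs 8 consecutive encoder frames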
             }
-            cur = ggml_view_2d(ctx0, cur, stride, padded_len / stride,
-                               ggml_row_size(cur->type, stride), 0);
-        }
 
-        cb(cur, "after_stacked", -1);
+            cb(cur, "after_stacked", -1);
 
-        // UltravoxProjector
-        {
-            // pre-norm
-            cur = ggml_rms_norm(ctx0, cur, 1e-6);
-            cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w);
+            // UltravoxProjector
+            {
+                // pre-norm
+                cur = ggml_rms_norm(ctx0, cur, 1e-6);
+                cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w);
 
-            // ffn in
-            cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
+                // ffn in
+                cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
 
-            // swiglu
-            {
-                int64_t split_point = cur->ne[0] / 2;
-                ggml_tensor * x0 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0));
-                ggml_tensor * x1 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur)));
+                // swiglu
+                {
+                    int64_t split_point = cur->ne[0] / 2;
+                    ggml_tensor * x0 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0));
+                    ggml_tensor * x1 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur)));
+
+                    // see SwiGLU in ultravox_model.py, the second half passed through is silu, not the first half
+                    x1 = ggml_silu(ctx0, x1);
+                    cur = ggml_mul(ctx0, x0, x1);
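+                    // i.e. SwiGLU(x) = x[:, :d/2] * silu(x[:, d/2:]), halving ne[0]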
+                }
 
-                // see SwiGLU in ultravox_model.py, the second half passed through is silu, not the first half
-                x1 = ggml_silu(ctx0, x1);
-                cur = ggml_mul(ctx0, x0, x1);
+                // mid-norm
+                cur = ggml_rms_norm(ctx0, cur, 1e-6);
+                cur = ggml_mul(ctx0, cur, model.mm_norm_mid_w);
+
+                // ffn out
+                cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
             }
 
-            // mid-norm
-            cur = ggml_rms_norm(ctx0, cur, 1e-6);
-            cur = ggml_mul(ctx0, cur, model.mm_norm_mid_w);
+        } else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2A) {
+            // projector
+            cur = ggml_mul_mat(ctx0, model.mm_fc_w, cur);
+            cur = ggml_add(ctx0, cur, model.mm_fc_b);
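+            // a single linear projector, matching Qwen2-Audio's multi_modal_projector
+            // (an nn.Linear from the encoder width to the LLM embedding size)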
 
-            // ffn out
-            cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
+        } else {
+            GGML_ABORT("%s: unknown projector type", __func__);
         }
 
         cb(cur, "projected", -1);
@@ -1655,6 +1667,17 @@ struct clip_graph {
             inpL = cur;
         }
 
+        // TODO @ngxson : find a way to move this outside
+        if (ctx->proj_type == PROJECTOR_TYPE_QWEN2A) {
+            ggml_tensor * cur = inpL;
+            cur = ggml_transpose(ctx0, cur);
+            cur = ggml_cont(ctx0, cur);
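+            // ggml_pool_1d pools along ne[0], so the transpose puts the time axis first;
+            // the k = 2, s = 2 average pool halves the token count (nn.AvgPool1d(2, stride=2) upstream)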
+            cur = ggml_pool_1d(ctx0, cur, GGML_OP_POOL_AVG, 2, 2, 0);
+            cur = ggml_transpose(ctx0, cur);
+            cur = ggml_cont(ctx0, cur);
+            inpL = cur;
+        }
+
         // post-layernorm
         if (model.post_ln_w) {
             inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, norm_t, eps, -1);
@@ -1952,6 +1975,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
                 res = graph.build_llama4();
             } break;
         case PROJECTOR_TYPE_ULTRAVOX:
+        case PROJECTOR_TYPE_QWEN2A:
             {
                 res = graph.build_whisper_enc();
             } break;
@@ -2186,8 +2210,10 @@ struct clip_model_loader {
                     };
                 } break;
             case PROJECTOR_TYPE_ULTRAVOX:
+            case PROJECTOR_TYPE_QWEN2A:
                 {
-                    get_u32(KEY_A_PROJ_STACK_FACTOR, hparams.proj_stack_factor);
+                    bool require_stack = ctx_clip.proj_type == PROJECTOR_TYPE_ULTRAVOX;
+                    get_u32(KEY_A_PROJ_STACK_FACTOR, hparams.proj_stack_factor, require_stack);
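+                    // only ultravox stacks audio frames, so the key is required there but optional for QWEN2A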
                     if (hparams.n_mel_bins != 128) {
                         throw std::runtime_error(string_format("%s: only 128 mel bins are supported for ultravox\n", __func__));
                     }
@@ -2266,7 +2292,7 @@ struct clip_model_loader {
             return cur;
         };
 
-        auto & vision_model = ctx_clip.vision_model;
+        auto & vision_model = ctx_clip.vision_model; // TODO: rename this to just "model"
 
         vision_model.class_embedding = get_tensor(TN_CLASS_EMBD, false);
 
@@ -2463,6 +2489,15 @@ struct clip_model_loader {
                     vision_model.mm_norm_pre_w = get_tensor(string_format(TN_MM_NORM_PRE, "weight"));
                     vision_model.mm_norm_mid_w = get_tensor(string_format(TN_MM_NORM_MID, "weight"));
                 } break;
+            case PROJECTOR_TYPE_QWEN2A:
+                {
+                    vision_model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
+                    vision_model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias"));
+                    vision_model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight"));
+                    vision_model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias"));
+                    vision_model.mm_fc_w = get_tensor(string_format(TN_MM_AUDIO_FC, "weight"));
+                    vision_model.mm_fc_b = get_tensor(string_format(TN_MM_AUDIO_FC, "bias"));
+                } break;
             case PROJECTOR_TYPE_INTERNVL:
                 {
                     vision_model.mm_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight"));
@@ -3450,6 +3485,10 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
         const int proj_stack_factor = ctx->vision_model.hparams.proj_stack_factor;
         const int n_len = CLIP_ALIGN(img->nx, proj_stack_factor);
         n_patches = n_len / proj_stack_factor / 2;
+    } else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2A) {
+        // divide by 2 because of whisper
+        // another divide by 2 because of nn.AvgPool1d(2, stride=2)
+        n_patches = img->nx / 4;
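+        // e.g. assuming a 30 s clip (3000 mel frames at a 10 ms hop): 3000 / 4 = 750 output tokens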
     }
 
     return n_patches;
@@ -3850,6 +3889,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         case PROJECTOR_TYPE_GEMMA3:
         case PROJECTOR_TYPE_IDEFICS3:
         case PROJECTOR_TYPE_INTERNVL:
+        case PROJECTOR_TYPE_QWEN2A:
         case PROJECTOR_TYPE_ULTRAVOX:
             {
                 // do nothing
@@ -3910,7 +3950,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     const int n_tokens_out = embeddings->ne[1];
     const int expected_n_tokens_out = clip_n_output_tokens(ctx, imgs.entries[0].get());
     if (n_tokens_out != expected_n_tokens_out) {
-        LOG_ERR("%s: expected %d tokens, got %d\n", __func__, expected_n_tokens_out, n_tokens_out);
+        LOG_ERR("%s: expected %d output tokens, got %d\n", __func__, expected_n_tokens_out, n_tokens_out);
         GGML_ABORT("Invalid number of output tokens");
     }
 
@@ -3955,6 +3995,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
             return ctx->vision_model.mm_3_w->ne[1];
         case PROJECTOR_TYPE_LLAMA4:
             return ctx->vision_model.mm_model_proj->ne[1];
+        case PROJECTOR_TYPE_QWEN2A:
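+            // mm_fc_w->ne[1] is the FC output dim, i.e. the LLM embedding size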
+            return ctx->vision_model.mm_fc_w->ne[1];
         default:
             GGML_ABORT("Unknown projector type");
     }
@@ -3991,6 +4033,10 @@ bool clip_has_audio_encoder(const struct clip_ctx * ctx) {
     return ctx->vision_model.hparams.has_audio;
 }
 
+bool clip_has_whisper_encoder(const struct clip_ctx * ctx) {
+    return ctx->proj_type == PROJECTOR_TYPE_ULTRAVOX || ctx->proj_type == PROJECTOR_TYPE_QWEN2A;
+}
+
 bool clip_encode_float_image(struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
     clip_image_f32 clip_img;
     clip_img.buf.resize(h * w * 3);