@@ -254,7 +254,9 @@ struct clip_vision_model {
     ggml_tensor * post_ln_w;
     ggml_tensor * post_ln_b;
 
-    ggml_tensor * projection;
+    ggml_tensor * projection; // TODO: rename it to fc (fully connected layer)
+    ggml_tensor * mm_fc_w;
+    ggml_tensor * mm_fc_b;
 
     // LLaVA projection
     ggml_tensor * mm_input_norm_w = nullptr;
@@ -1471,48 +1473,58 @@ struct clip_graph {
 
         cb(cur, "after_transformer", -1);
 
-        // StackAudioFrames
-        // https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/blob/main/ultravox_model.py
-        {
-            int64_t stride = n_embd * hparams.proj_stack_factor;
-            int64_t padded_len = GGML_PAD(ggml_nelements(cur), stride);
-            int64_t pad = padded_len - ggml_nelements(cur);
-            if (pad > 0) {
-                cur = ggml_view_1d(ctx0, cur, ggml_nelements(cur), 0);
-                cur = ggml_pad(ctx0, cur, pad, 0, 0, 0);
+        if (ctx->proj_type == PROJECTOR_TYPE_ULTRAVOX) {
+            // StackAudioFrames
+            // https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/blob/main/ultravox_model.py
+            {
+                int64_t stride = n_embd * hparams.proj_stack_factor;
+                int64_t padded_len = GGML_PAD(ggml_nelements(cur), stride);
+                int64_t pad = padded_len - ggml_nelements(cur);
+                if (pad > 0) {
+                    cur = ggml_view_1d(ctx0, cur, ggml_nelements(cur), 0);
+                    cur = ggml_pad(ctx0, cur, pad, 0, 0, 0);
+                }
+                cur = ggml_view_2d(ctx0, cur, stride, padded_len / stride,
+                                    ggml_row_size(cur->type, stride), 0);
             }
-            cur = ggml_view_2d(ctx0, cur, stride, padded_len / stride,
-                                ggml_row_size(cur->type, stride), 0);
-        }
 
-        cb(cur, "after_stacked", -1);
+            cb(cur, "after_stacked", -1);
 
-        // UltravoxProjector
-        {
-            // pre-norm
-            cur = ggml_rms_norm(ctx0, cur, 1e-6);
-            cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w);
+            // UltravoxProjector
+            {
+                // pre-norm
+                cur = ggml_rms_norm(ctx0, cur, 1e-6);
+                cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w);
 
-            // ffn in
-            cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
+                // ffn in
+                cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
 
-            // swiglu
-            {
-                int64_t split_point = cur->ne[0] / 2;
-                ggml_tensor * x0 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0));
-                ggml_tensor * x1 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur)));
+                // swiglu
+                {
+                    int64_t split_point = cur->ne[0] / 2;
+                    ggml_tensor * x0 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0));
+                    ggml_tensor * x1 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur)));
+
+                    // see SwiGLU in ultravox_model.py: the second half is the one passed through silu, not the first
+                    x1 = ggml_silu(ctx0, x1);
+                    cur = ggml_mul(ctx0, x0, x1);
+                }
 
-                // see SwiGLU in ultravox_model.py: the second half is the one passed through silu, not the first
-                x1 = ggml_silu(ctx0, x1);
-                cur = ggml_mul(ctx0, x0, x1);
+                // mid-norm
+                cur = ggml_rms_norm(ctx0, cur, 1e-6);
+                cur = ggml_mul(ctx0, cur, model.mm_norm_mid_w);
+
+                // ffn out
+                cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
             }
 
-            // mid-norm
-            cur = ggml_rms_norm(ctx0, cur, 1e-6);
-            cur = ggml_mul(ctx0, cur, model.mm_norm_mid_w);
+        } else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2A) {
+            // projector
+            cur = ggml_mul_mat(ctx0, model.mm_fc_w, cur);
+            cur = ggml_add(ctx0, cur, model.mm_fc_b);
 
-            // ffn out
-            cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
+        } else {
+            GGML_ABORT("%s: unknown projector type", __func__);
         }
 
         cb(cur, "projected", -1);
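
A quick numeric check of the stacking arithmetic above (standalone sketch, not part of the patch; all sizes are made up). GGML_PAD rounds the element count up to a multiple of stride, zeros are appended, and the flat buffer is then viewed as rows of n_embd * proj_stack_factor values, i.e. proj_stack_factor consecutive audio frames fused into one row:

#include <cstdint>
#include <cstdio>

int main() {
    const int64_t n_embd = 4, proj_stack_factor = 8;
    const int64_t n_frames = 27;                         // deliberately not a multiple of 8
    const int64_t stride = n_embd * proj_stack_factor;   // 32
    const int64_t nelem  = n_embd * n_frames;            // 108
    // same rounding as GGML_PAD(nelem, stride)
    const int64_t padded_len = ((nelem + stride - 1) / stride) * stride; // 128
    const int64_t pad = padded_len - nelem;              // 20 zeros appended
    printf("pad=%lld, stacked rows=%lld of width %lld\n",
           (long long) pad, (long long)(padded_len / stride), (long long) stride);
    return 0;
}
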
@@ -1655,6 +1667,35 @@ struct clip_graph {
             inpL = cur;
         }
 
+        // TODO @ngxson : find a way to move this outside of this function
+        if (ctx->proj_type == PROJECTOR_TYPE_QWEN2A) {
+            ggml_tensor * cur = inpL;
+            // note: ggml_pool_1d is not supported on some GPU backends;
+            // the disabled block below replaces it with sum_rows + ggml_scale
+            /*
+            // add padding if number of frames is not divisible by 2
+            if (cur->ne[1] % 2 != 0) {
+                cur = ggml_pad(ctx0, cur, 0, 1, 0, 0);
+            }
+            cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], 2, cur->ne[1]/2); // [n_embd, 2, n_frames/2]
+            cur = ggml_transpose(ctx0, cur); // [2, n_embd, n_frames/2]
+            // calc mean value
+            {
+                cur = ggml_cast(ctx0, cur, GGML_TYPE_F32);
+                cur = ggml_sum_rows(ctx0, cur); // [1, n_embd, n_frames/2]
+                cur = ggml_scale(ctx0, cur, 0.5f);
+            }
+            cur = ggml_transpose(ctx0, cur); // [n_embd, 1, n_frames/2]
+            cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], cur->ne[2]); // [n_embd, n_frames/2]
+            */
+            cur = ggml_transpose(ctx0, cur);
+            cur = ggml_cast(ctx0, cur, GGML_TYPE_F32);
+            cur = ggml_pool_1d(ctx0, cur, GGML_OP_POOL_AVG, 2, 2, 0);
+            cur = ggml_transpose(ctx0, cur);
+            cur = ggml_cast(ctx0, cur, GGML_TYPE_F32);
+            inpL = cur;
+        }
+
         // post-layernorm
         if (model.post_ln_w) {
             inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, norm_t, eps, -1);
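
The live ggml_pool_1d path and the disabled sum_rows + ggml_scale block compute the same thing: each embedding channel is averaged over pairs of consecutive frames, since summing two frames and scaling by 0.5 is exactly average pooling with kernel 2 and stride 2. A plain-C++ sketch of that equivalence, independent of ggml:

#include <cstdio>
#include <vector>

int main() {
    std::vector<float> frames = {1.f, 3.f, 5.f, 7.f}; // one channel, 4 frames
    std::vector<float> pooled;
    for (size_t i = 0; i + 1 < frames.size(); i += 2) {
        // sum two frames, scale by 0.5 == AvgPool1d(kernel=2, stride=2)
        pooled.push_back((frames[i] + frames[i + 1]) * 0.5f);
    }
    for (float v : pooled) printf("%g ", v); // prints: 2 6
    printf("\n");
    return 0;
}
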
@@ -1952,6 +1993,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
                 res = graph.build_llama4();
             } break;
         case PROJECTOR_TYPE_ULTRAVOX:
+        case PROJECTOR_TYPE_QWEN2A:
             {
                 res = graph.build_whisper_enc();
             } break;
@@ -2186,8 +2228,10 @@ struct clip_model_loader {
                     };
                 } break;
             case PROJECTOR_TYPE_ULTRAVOX:
+            case PROJECTOR_TYPE_QWEN2A:
                 {
-                    get_u32(KEY_A_PROJ_STACK_FACTOR, hparams.proj_stack_factor);
+                    bool require_stack = ctx_clip.proj_type == PROJECTOR_TYPE_ULTRAVOX;
+                    get_u32(KEY_A_PROJ_STACK_FACTOR, hparams.proj_stack_factor, require_stack);
                     if (hparams.n_mel_bins != 128) {
                         throw std::runtime_error(string_format("%s: only 128 mel bins are supported for ultravox\n", __func__));
                     }
@@ -2266,7 +2310,7 @@ struct clip_model_loader {
             return cur;
         };
 
-        auto & vision_model = ctx_clip.vision_model;
+        auto & vision_model = ctx_clip.vision_model; // TODO: rename this to just "model"
 
         vision_model.class_embedding = get_tensor(TN_CLASS_EMBD, false);
 
@@ -2463,6 +2507,15 @@ struct clip_model_loader {
                     vision_model.mm_norm_pre_w = get_tensor(string_format(TN_MM_NORM_PRE, "weight"));
                     vision_model.mm_norm_mid_w = get_tensor(string_format(TN_MM_NORM_MID, "weight"));
                 } break;
+            case PROJECTOR_TYPE_QWEN2A:
+                {
+                    vision_model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
+                    vision_model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias"));
+                    vision_model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight"));
+                    vision_model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias"));
+                    vision_model.mm_fc_w = get_tensor(string_format(TN_MM_AUDIO_FC, "weight"));
+                    vision_model.mm_fc_b = get_tensor(string_format(TN_MM_AUDIO_FC, "bias"));
+                } break;
             case PROJECTOR_TYPE_INTERNVL:
                 {
                     vision_model.mm_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight"));
@@ -3450,6 +3503,10 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
         const int proj_stack_factor = ctx->vision_model.hparams.proj_stack_factor;
         const int n_len = CLIP_ALIGN(img->nx, proj_stack_factor);
         n_patches = n_len / proj_stack_factor / 2;
+    } else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2A) {
+        // divide by 2 because of whisper
+        // another divide by 2 because of nn.AvgPool1d(2, stride=2)
+        n_patches = img->nx / 4;
     }
 
     return n_patches;
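
To make the token arithmetic concrete: for ULTRAVOX the frame count is aligned up to the stack factor, divided by it, then halved by the whisper encoder stride; for QWEN2A it is simply divided by 4 (2 from whisper, 2 from the average pool added above). A sketch with illustrative numbers — the stack factor of 8 and the 3000-frame input are assumptions, not values from the patch:

#include <cstdio>

// align_up mirrors what CLIP_ALIGN is used for here: round a up to a multiple of b
static int align_up(int a, int b) { return ((a + b - 1) / b) * b; }

int main() {
    const int nx = 3000;   // assumed number of input mel frames (e.g. 30 s at a 10 ms hop)
    const int stack = 8;   // assumed proj_stack_factor
    // ULTRAVOX: stack frames, then /2 from the whisper encoder stride
    printf("ultravox tokens: %d\n", align_up(nx, stack) / stack / 2); // 187
    // QWEN2A: /2 from whisper, /2 from nn.AvgPool1d(2, stride=2)
    printf("qwen2a tokens:   %d\n", nx / 4);                          // 750
    return 0;
}
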
@@ -3850,6 +3907,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         case PROJECTOR_TYPE_GEMMA3:
         case PROJECTOR_TYPE_IDEFICS3:
         case PROJECTOR_TYPE_INTERNVL:
+        case PROJECTOR_TYPE_QWEN2A:
         case PROJECTOR_TYPE_ULTRAVOX:
             {
                 // do nothing
@@ -3910,7 +3968,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     const int n_tokens_out = embeddings->ne[1];
     const int expected_n_tokens_out = clip_n_output_tokens(ctx, imgs.entries[0].get());
     if (n_tokens_out != expected_n_tokens_out) {
-        LOG_ERR("%s: expected %d tokens, got %d\n", __func__, expected_n_tokens_out, n_tokens_out);
+        LOG_ERR("%s: expected %d output tokens, got %d\n", __func__, expected_n_tokens_out, n_tokens_out);
         GGML_ABORT("Invalid number of output tokens");
     }
 
@@ -3955,6 +4013,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
             return ctx->vision_model.mm_3_w->ne[1];
         case PROJECTOR_TYPE_LLAMA4:
             return ctx->vision_model.mm_model_proj->ne[1];
+        case PROJECTOR_TYPE_QWEN2A:
+            return ctx->vision_model.mm_fc_w->ne[1];
         default:
            GGML_ABORT("Unknown projector type");
     }
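
Why ne[1]? In ggml a 2D weight stores the input width in ne[0] and the output width in ne[1], and ggml_mul_mat(w, x) produces rows of length w->ne[1], so mm_fc_w->ne[1] is the LLM embedding size the audio tokens are projected into. A small metadata-only sketch, assuming ggml.h is available; the 1280/3584 sizes are illustrative, not taken from any model:

#include "ggml.h"
#include <cstdio>

int main() {
    // no_alloc=true: only tensor metadata is created, no buffers, no compute
    ggml_init_params params = { /*mem_size=*/ 16 * 1024 * 1024, /*mem_buffer=*/ nullptr, /*no_alloc=*/ true };
    ggml_context * ctx = ggml_init(params);

    // hypothetical sizes: 1280-dim encoder output projected to a 3584-dim LLM
    ggml_tensor * w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1280, 3584); // ne[0]=in, ne[1]=out
    ggml_tensor * x = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1280, 10);   // 10 audio tokens

    ggml_tensor * y = ggml_mul_mat(ctx, w, x); // y->ne[0] == w->ne[1]
    printf("projected: %lld x %lld\n", (long long) y->ne[0], (long long) y->ne[1]); // 3584 x 10

    ggml_free(ctx);
    return 0;
}
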
@@ -3991,6 +4051,10 @@ bool clip_has_audio_encoder(const struct clip_ctx * ctx) {
     return ctx->vision_model.hparams.has_audio;
 }
 
+bool clip_has_whisper_encoder(const struct clip_ctx * ctx) {
+    return ctx->proj_type == PROJECTOR_TYPE_ULTRAVOX || ctx->proj_type == PROJECTOR_TYPE_QWEN2A;
+}
+
 bool clip_encode_float_image(struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
     clip_image_f32 clip_img;
     clip_img.buf.resize(h * w * 3);