@@ -354,6 +354,16 @@ struct clip_model {
354354 ggml_tensor * conv1d_2_b = nullptr ;
355355 ggml_tensor * mm_norm_pre_w = nullptr ;
356356 ggml_tensor * mm_norm_mid_w = nullptr ;
357+
358+ bool audio_has_avgpool () const { // true for projector types whose audio path applies nn.AvgPool1d(2, stride=2); clip_n_output_tokens halves the token count when this is set
359+ return proj_type == PROJECTOR_TYPE_QWEN2A // previously counted as img->nx / 4 (whisper /2, then avgpool /2)
360+ || proj_type == PROJECTOR_TYPE_VOXTRAL; // previously handled by an explicit `n_patches_sq /= 2` for this type
361+ }
362+
363+ bool audio_has_stack_frames () const { // true for projector types that run the StackAudioFrames step (rows of n_embd * proj_stack_factor elements) before projecting
364+ return proj_type == PROJECTOR_TYPE_ULTRAVOX // both types previously called build_whisper_stack_audio_frames() in their own branch
365+ || proj_type == PROJECTOR_TYPE_VOXTRAL; // callers assert proj_stack_factor > 0 before dividing by it
366+ }
357367};
358368
359369struct clip_ctx {
@@ -1483,10 +1493,22 @@ struct clip_graph {
14831493
14841494 cb (cur, " after_transformer" , -1 );
14851495
1486- if (ctx->proj_type () == PROJECTOR_TYPE_ULTRAVOX) {
1487- cur = build_whisper_stack_audio_frames (cur);
1496+ if (model.audio_has_stack_frames ()) {
1497+ // StackAudioFrames
1498+ // https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/blob/main/ultravox_model.py
1499+ int64_t stride = n_embd * hparams.proj_stack_factor ;
1500+ int64_t padded_len = GGML_PAD (ggml_nelements (cur), stride);
1501+ int64_t pad = padded_len - ggml_nelements (cur);
1502+ if (pad > 0 ) {
1503+ cur = ggml_view_1d (ctx0, cur, ggml_nelements (cur), 0 );
1504+ cur = ggml_pad (ctx0, cur, pad, 0 , 0 , 0 );
1505+ }
1506+ cur = ggml_view_2d (ctx0, cur, stride, padded_len / stride,
1507+ ggml_row_size (cur->type , stride), 0 );
14881508 cb (cur, " after_stacked" , -1 );
1509+ }
14891510
1511+ if (ctx->proj_type () == PROJECTOR_TYPE_ULTRAVOX) {
14901512 // UltravoxProjector
14911513 {
14921514 // pre-norm
@@ -1514,7 +1536,7 @@ struct clip_graph {
15141536 cur = ggml_add (ctx0, cur, model.mm_fc_b );
15151537
15161538 } else if (ctx->proj_type () == PROJECTOR_TYPE_VOXTRAL) {
1517- cur = build_whisper_stack_audio_frames (cur);
1539+ // projector
15181540 cb (cur, " after_stacked" , -1 );
15191541 cur = ggml_mul_mat (ctx0, model.mm_1_w , cur);
15201542 cur = ggml_relu (ctx0, cur);
@@ -1531,21 +1553,6 @@ struct clip_graph {
15311553 return gf;
15321554 }
15331555
1534- ggml_tensor * build_whisper_stack_audio_frames (ggml_tensor * cur) {
1535- // StackAudioFrames
1536- // https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/blob/main/ultravox_model.py
1537- int64_t stride = n_embd * hparams.proj_stack_factor ;
1538- int64_t padded_len = GGML_PAD (ggml_nelements (cur), stride);
1539- int64_t pad = padded_len - ggml_nelements (cur);
1540- if (pad > 0 ) {
1541- cur = ggml_view_1d (ctx0, cur, ggml_nelements (cur), 0 );
1542- cur = ggml_pad (ctx0, cur, pad, 0 , 0 , 0 );
1543- }
1544- cur = ggml_view_2d (ctx0, cur, stride, padded_len / stride,
1545- ggml_row_size (cur->type , stride), 0 );
1546- return cur;
1547- }
1548-
15491556private:
15501557 //
15511558 // utility functions
@@ -1679,8 +1686,7 @@ struct clip_graph {
16791686 inpL = cur;
16801687 }
16811688
1682- // TODO @ngxson : find a way to move this outside
1683- if (ctx->proj_type () == PROJECTOR_TYPE_QWEN2A || ctx->proj_type () == PROJECTOR_TYPE_VOXTRAL) {
1689+ if (ctx->model .audio_has_avgpool ()) {
16841690 ggml_tensor * cur = inpL;
16851691 cur = ggml_transpose (ctx0, cur);
16861692 cur = ggml_cont (ctx0, cur);
@@ -3593,21 +3599,23 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
35933599 } break ;
35943600 case PROJECTOR_TYPE_VOXTRAL:
35953601 case PROJECTOR_TYPE_ULTRAVOX:
3602+ case PROJECTOR_TYPE_QWEN2A:
35963603 {
3604+ // whisper's conv1d stage halves the number of input tokens
3605+ n_patches_sq = img->nx / 2 ;
3606+
35973607 const int proj_stack_factor = ctx->model .hparams .proj_stack_factor ;
3598- const int n_len = CLIP_ALIGN (img->nx , proj_stack_factor);
3599- n_patches_sq = n_len / proj_stack_factor / 2 ;
3608+ if (ctx->model .audio_has_stack_frames ()) {
3609+ GGML_ASSERT (proj_stack_factor > 0 );
3610+ const int n_len = CLIP_ALIGN (n_patches_sq, proj_stack_factor);
3611+ n_patches_sq = n_len / proj_stack_factor;
3612+ }
36003613
3601- if (proj == PROJECTOR_TYPE_VOXTRAL) {
3602- n_patches_sq /= 2 ; // divide by 2 because of nn.AvgPool1d(2, stride=2)
3614+ if (ctx->model .audio_has_avgpool ()) {
3615+ // divide by 2 because of nn.AvgPool1d(2, stride=2)
3616+ n_patches_sq /= 2 ;
36033617 }
36043618 } break ;
3605- case PROJECTOR_TYPE_QWEN2A:
3606- {
3607- // divide by 2 because of whisper
3608- // another divide by 2 because of nn.AvgPool1d(2, stride=2)
3609- n_patches_sq = img->nx / 4 ;
3610- } break ;
36113619 default :
36123620 GGML_ABORT (" unsupported projector type" );
36133621 }
0 commit comments