@@ -1848,15 +1848,7 @@ struct clip_graph {
         if (model.audio_has_stack_frames()) {
             // StackAudioFrames
             // https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/blob/main/ultravox_model.py
-            int64_t stride     = n_embd * hparams.proj_stack_factor;
-            int64_t padded_len = GGML_PAD(ggml_nelements(cur), stride);
-            int64_t pad        = padded_len - ggml_nelements(cur);
-            if (pad > 0) {
-                cur = ggml_view_1d(ctx0, cur, ggml_nelements(cur), 0);
-                cur = ggml_pad(ctx0, cur, pad, 0, 0, 0);
-            }
-            cur = ggml_view_2d(ctx0, cur, stride, padded_len / stride,
-                ggml_row_size(cur->type, stride), 0);
+            cur = build_stack(cur, hparams.proj_stack_factor, n_embd);
             cb(cur, "after_stacked", -1);
         }
 
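For intuition, the stacking step rounds the flat element count up to the nearest multiple of n_embd * proj_stack_factor, then folds the result into rows of that stride. A minimal sketch of the arithmetic, mirroring ggml's GGML_PAD macro; the 1280-dim / factor-8 / 298-frame numbers below are purely illustrative, not taken from this patch:

    #include <cstdint>
    #include <cstdio>

    // same rounding as ggml's GGML_PAD(x, n): round x up to a multiple of n
    static int64_t pad_to(int64_t x, int64_t n) { return ((x + n - 1) / n) * n; }

    int main() {
        const int64_t n_embd = 1280, stack_factor = 8, n_frames = 298; // hypothetical
        const int64_t stride = n_embd * stack_factor; // 10240 elements per stacked row
        const int64_t total  = n_embd * n_frames;     // 381440 elements in the flat view
        const int64_t padded = pad_to(total, stride); // 389120, divisible by stride
        printf("pad = %lld, rows = %lld\n",
               (long long) (padded - total), (long long) (padded / stride));
        // prints: pad = 7680, rows = 38, i.e. ceil(298 / 8) stacked frames
        return 0;
    }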
@@ -1895,12 +1887,8 @@ struct clip_graph {
             cur = ggml_norm(ctx0, cur, hparams.eps);
             cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w);
             cur = ggml_add(ctx0, cur, model.mm_norm_pre_b);
-            cur = ggml_reshape_2d(ctx0, cur, cur->ne[0] * 4, cur->ne[1] / 4);
-            cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
-            cur = ggml_add(ctx0, cur, model.mm_1_b);
-            cur = ggml_gelu_erf(ctx0, cur);
-            cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
-            cur = ggml_add(ctx0, cur, model.mm_2_b);
+            cur = build_stack(cur, hparams.proj_stack_factor, n_embd);
+            cur = build_ffn(cur, model.mm_1_w, model.mm_1_b, nullptr, nullptr, model.mm_2_w, model.mm_2_b, hparams.ffn_op, 0);
             cur = ggml_concat(ctx0, model.mm_boi, cur, 1);
             cur = ggml_concat(ctx0, cur, model.mm_eoi, 1);
         } else {
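Judging by the argument order in the call above, build_ffn bundles the up-projection, an optional gated branch, the activation, and the down-projection that the removed lines spelled out by hand. A hedged annotation of the call (the parameter roles are inferred from the arguments, not stated in the patch):

    cur = build_ffn(cur,
                    model.mm_1_w, model.mm_1_b, // up-projection weight/bias
                    nullptr,      nullptr,      // no gate branch
                    model.mm_2_w, model.mm_2_b, // down-projection weight/bias
                    hparams.ffn_op,             // activation; replaces the hardcoded ggml_gelu_erf
                    0);                         // presumably a layer index for debug callbacks

Note that the removed reshape hardcoded a factor of 4, whereas build_stack reads hparams.proj_stack_factor; the two paths only coincide when the GGUF metadata stores a stack factor of 4.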
@@ -2486,6 +2474,32 @@ struct clip_graph {
         return cur;
     }
 
+    // Generic function to stack frames for audio processing
+    // Abstracts out the StackAudioFrames logic used by ultravox
+    ggml_tensor * build_stack(ggml_tensor * cur, int32_t stack_factor, int32_t n_embed) {
+        if (stack_factor <= 1) {
+            return cur;
+        }
+
+        int64_t total_elements = ggml_nelements(cur);
+        int64_t stride         = n_embed * stack_factor;
+
+        // Calculate padded length
+        int64_t padded_len = GGML_PAD(total_elements, stride);
+        int64_t pad        = padded_len - total_elements;
+
+        if (pad > 0) {
+            // Pad the tensor so its element count is divisible by the stride
+            cur = ggml_view_1d(ctx0, cur, total_elements, 0);
+            cur = ggml_pad(ctx0, cur, pad, 0, 0, 0);
+        }
+
+        // Reshape to [stride, padded_len / stride]
+        cur = ggml_view_2d(ctx0, cur, stride, padded_len / stride,
+            ggml_row_size(cur->type, stride), 0);
+        return cur;
+    }
+
 };
 
 static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch & imgs) {
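To sanity-check the shape math in isolation, the same three ggml calls can be replayed in a standalone program. This is a sketch against plain ggml with hypothetical dimensions, not part of the patch:

    #include "ggml.h"
    #include <cstdio>

    int main() {
        ggml_init_params params = { 16 * 1024 * 1024, nullptr, false };
        ggml_context * ctx = ggml_init(params);

        // 7 tokens of a 16-dim embedding, stacked by a factor of 4 (hypothetical)
        const int64_t n_embd = 16, n_tokens = 7, stack_factor = 4;
        ggml_tensor * cur = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);

        const int64_t stride     = n_embd * stack_factor;
        const int64_t padded_len = GGML_PAD(ggml_nelements(cur), stride);
        const int64_t pad        = padded_len - ggml_nelements(cur);

        // same sequence as build_stack: flatten, pad, fold into rows of `stride`
        cur = ggml_view_1d(ctx, cur, ggml_nelements(cur), 0);
        cur = ggml_pad(ctx, cur, pad, 0, 0, 0);
        cur = ggml_view_2d(ctx, cur, stride, padded_len / stride,
                           ggml_row_size(cur->type, stride), 0);

        printf("%lld x %lld\n", (long long) cur->ne[0], (long long) cur->ne[1]); // 64 x 2
        ggml_free(ctx);
        return 0;
    }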
@@ -2864,10 +2878,12 @@ struct clip_model_loader {
                 } break;
             case PROJECTOR_TYPE_ULTRAVOX:
             case PROJECTOR_TYPE_QWEN2A:
+            case PROJECTOR_TYPE_GLMA:
             case PROJECTOR_TYPE_VOXTRAL:
                 {
                     bool require_stack = model.proj_type == PROJECTOR_TYPE_ULTRAVOX ||
-                                         model.proj_type == PROJECTOR_TYPE_VOXTRAL;
+                                         model.proj_type == PROJECTOR_TYPE_VOXTRAL ||
+                                         model.proj_type == PROJECTOR_TYPE_GLMA;
                     get_u32(KEY_A_PROJ_STACK_FACTOR, hparams.proj_stack_factor, require_stack);
                     if (hparams.n_mel_bins != 128) {
                         throw std::runtime_error(string_format("%s: only 128 mel bins are supported for ultravox\n", __func__));
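The require_stack flag feeds the third argument of get_u32 which, following the usual clip_model_loader pattern, reads an optional u32 from the GGUF metadata and throws only when the key is required for the projector at hand. A hedged sketch of that contract (the member name gguf_ctx and the error text are assumptions, not taken from the patch):

    // assumed behaviour of clip_model_loader::get_u32, for context only
    void get_u32(const std::string & key, int & dst, bool required) {
        const int64_t i = gguf_find_key(gguf_ctx, key.c_str());
        if (i < 0) {
            if (required) {
                throw std::runtime_error("required key not found: " + key);
            }
            return; // optional key missing: keep the default already in dst
        }
        dst = (int) gguf_get_val_u32(gguf_ctx, i);
    }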
@@ -4640,7 +4656,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
             // whisper downscales input token by half after conv1d
             n_patches /= 2;
             // reshape by merge_factor
-            n_patches /= 4;
+            n_patches /= ctx->model.hparams.proj_stack_factor;
             // for BOI and EOI token embeddings
             n_patches += 2;
         } break;
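A worked example of the resulting token count, with hypothetical numbers chosen to divide evenly (a stack factor of 4 is assumed; note that build_stack pads upward while the division here truncates, so the model metadata presumably keeps the counts divisible):

    int n_patches = 1600; // encoder positions, illustrative
    n_patches /= 2;       // conv1d halving        -> 800
    n_patches /= 4;       // proj_stack_factor = 4 -> 200
    n_patches += 2;       // BOI + EOI tokens      -> 202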