@@ -720,6 +720,32 @@ ggml_tensor * clip_graph::build_rope_2d(
720720 return cur;
721721}
722722
723+ // Generic function to stack frames for audio processing
724+ // Abstracts out the StackAudioFrames logic used by ultravox
725+ ggml_tensor * clip_graph::build_stack (ggml_tensor * cur, int32_t stack_factor, int32_t n_embed) {
726+ if (stack_factor <= 1 ) {
727+ return cur;
728+ }
729+
730+ int64_t total_elements = ggml_nelements (cur);
731+ int64_t stride = n_embed * stack_factor;
732+
733+ // Calculate padded length
734+ int64_t padded_len = GGML_PAD (total_elements, stride);
735+ int64_t pad = padded_len - total_elements;
736+
737+ if (pad > 0 ) {
738+ // Pad the tensor to make it divisible by stride
739+ cur = ggml_view_1d (ctx0, cur, total_elements, 0 );
740+ cur = ggml_pad (ctx0, cur, pad, 0 , 0 , 0 );
741+ }
742+
743+ // Reshape to [stride, padded_len / stride]
744+ cur = ggml_view_2d (ctx0, cur, stride, padded_len / stride,
745+ ggml_row_size (cur->type , stride), 0 );
746+ return cur;
747+ }
748+
723749// aka pixel_shuffle / pixel_unshuffle / patch_merger (Kimi-VL)
724750// support dynamic resolution
725751ggml_tensor * clip_graph::build_patch_merge_permute (ggml_tensor * cur, int scale_factor) {
@@ -753,34 +779,6 @@ ggml_tensor * clip_graph::build_patch_merge_permute(ggml_tensor * cur, int scale
753779 return cur;
754780}
755781
756- // Generic function to stack frames for audio processing
757- // Abstracts out the StackAudioFrames logic used by ultravox
758- ggml_tensor * build_stack (ggml_tensor * cur, int32_t stack_factor, int32_t n_embed) {
759- if (stack_factor <= 1 ) {
760- return cur;
761- }
762-
763- int64_t total_elements = ggml_nelements (cur);
764- int64_t stride = n_embed * stack_factor;
765-
766- // Calculate padded length
767- int64_t padded_len = GGML_PAD (total_elements, stride);
768- int64_t pad = padded_len - total_elements;
769-
770- if (pad > 0 ) {
771- // Pad the tensor to make it divisible by stride
772- cur = ggml_view_1d (ctx0, cur, total_elements, 0 );
773- cur = ggml_pad (ctx0, cur, pad, 0 , 0 , 0 );
774- }
775-
776- // Reshape to [stride, padded_len / stride]
777- cur = ggml_view_2d (ctx0, cur, stride, padded_len / stride,
778- ggml_row_size (cur->type , stride), 0 );
779- return cur;
780- }
781-
782- };
783-
784782static ggml_cgraph * clip_image_build_graph (clip_ctx * ctx, const clip_image_f32_batch & imgs) {
785783 GGML_ASSERT (imgs.entries .size () == 1 && " n_batch > 1 is not supported" );
786784
0 commit comments