Skip to content

Commit 86339b0

Browse files
committed
fix conflict with upstream
1 parent 7da7853 commit 86339b0

File tree

4 files changed

+40
-38
lines changed

4 files changed

+40
-38
lines changed

tools/mtmd/clip-graph.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,4 +112,8 @@ struct clip_graph {
112112
// aka pixel_shuffle / pixel_unshuffle / patch_merger (Kimi-VL)
113113
// support dynamic resolution
114114
ggml_tensor * build_patch_merge_permute(ggml_tensor * cur, int scale_factor);
115+
116+
// Generic function to stack frames for audio processing
117+
// Abstracts out the StackAudioFrames logic used by ultravox
118+
ggml_tensor * build_stack(ggml_tensor * cur, int32_t stack_factor, int32_t n_embed);
115119
};

tools/mtmd/clip-model.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -256,6 +256,7 @@ struct clip_model {
256256
ggml_tensor * conv1d_2_w = nullptr;
257257
ggml_tensor * conv1d_2_b = nullptr;
258258
ggml_tensor * mm_norm_pre_w = nullptr;
259+
ggml_tensor * mm_norm_pre_b = nullptr;
259260
ggml_tensor * mm_norm_mid_w = nullptr;
260261

261262
// cogvlm

tools/mtmd/clip.cpp

Lines changed: 26 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -720,6 +720,32 @@ ggml_tensor * clip_graph::build_rope_2d(
720720
return cur;
721721
}
722722

723+
// Generic function to stack frames for audio processing
724+
// Abstracts out the StackAudioFrames logic used by ultravox
725+
ggml_tensor * clip_graph::build_stack(ggml_tensor * cur, int32_t stack_factor, int32_t n_embed) {
726+
if (stack_factor <= 1) {
727+
return cur;
728+
}
729+
730+
int64_t total_elements = ggml_nelements(cur);
731+
int64_t stride = n_embed * stack_factor;
732+
733+
// Calculate padded length
734+
int64_t padded_len = GGML_PAD(total_elements, stride);
735+
int64_t pad = padded_len - total_elements;
736+
737+
if (pad > 0) {
738+
// Pad the tensor to make it divisible by stride
739+
cur = ggml_view_1d(ctx0, cur, total_elements, 0);
740+
cur = ggml_pad(ctx0, cur, pad, 0, 0, 0);
741+
}
742+
743+
// Reshape to [stride, padded_len / stride]
744+
cur = ggml_view_2d(ctx0, cur, stride, padded_len / stride,
745+
ggml_row_size(cur->type, stride), 0);
746+
return cur;
747+
}
748+
723749
// aka pixel_shuffle / pixel_unshuffle / patch_merger (Kimi-VL)
724750
// support dynamic resolution
725751
ggml_tensor * clip_graph::build_patch_merge_permute(ggml_tensor * cur, int scale_factor) {
@@ -753,34 +779,6 @@ ggml_tensor * clip_graph::build_patch_merge_permute(ggml_tensor * cur, int scale
753779
return cur;
754780
}
755781

756-
// Generic function to stack frames for audio processing
757-
// Abstracts out the StackAudioFrames logic used by ultravox
758-
ggml_tensor * build_stack(ggml_tensor * cur, int32_t stack_factor, int32_t n_embed) {
759-
if (stack_factor <= 1) {
760-
return cur;
761-
}
762-
763-
int64_t total_elements = ggml_nelements(cur);
764-
int64_t stride = n_embed * stack_factor;
765-
766-
// Calculate padded length
767-
int64_t padded_len = GGML_PAD(total_elements, stride);
768-
int64_t pad = padded_len - total_elements;
769-
770-
if (pad > 0) {
771-
// Pad the tensor to make it divisible by stride
772-
cur = ggml_view_1d(ctx0, cur, total_elements, 0);
773-
cur = ggml_pad(ctx0, cur, pad, 0, 0, 0);
774-
}
775-
776-
// Reshape to [stride, padded_len / stride]
777-
cur = ggml_view_2d(ctx0, cur, stride, padded_len / stride,
778-
ggml_row_size(cur->type, stride), 0);
779-
return cur;
780-
}
781-
782-
};
783-
784782
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch & imgs) {
785783
GGML_ASSERT(imgs.entries.size() == 1 && "n_batch > 1 is not supported");
786784

tools/mtmd/models/whisper-enc.cpp

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@ ggml_cgraph * clip_graph_whisper_enc::build() {
3030
GGML_ASSERT(model.layers[0].q_b);
3131
GGML_ASSERT(model.layers[0].v_b);
3232
GGML_ASSERT(!model.layers[0].k_b); // no bias for k
33-
GGML_ASSERT(model.post_ln_w && model.post_ln_b);
3433

3534
ggml_tensor * pos_embd_selected = ggml_view_2d(
3635
ctx0, model.position_embeddings,
@@ -49,15 +48,7 @@ ggml_cgraph * clip_graph_whisper_enc::build() {
4948
if (model.audio_has_stack_frames()) {
5049
// StackAudioFrames
5150
// https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/blob/main/ultravox_model.py
52-
int64_t stride = n_embd * hparams.proj_stack_factor;
53-
int64_t padded_len = GGML_PAD(ggml_nelements(cur), stride);
54-
int64_t pad = padded_len - ggml_nelements(cur);
55-
if (pad > 0) {
56-
cur = ggml_view_1d(ctx0, cur, ggml_nelements(cur), 0);
57-
cur = ggml_pad(ctx0, cur, pad, 0, 0, 0);
58-
}
59-
cur = ggml_view_2d(ctx0, cur, stride, padded_len / stride,
60-
ggml_row_size(cur->type, stride), 0);
51+
cur = build_stack(cur, hparams.proj_stack_factor, n_embd);
6152
cb(cur, "after_stacked", -1);
6253
}
6354

@@ -95,6 +86,14 @@ ggml_cgraph * clip_graph_whisper_enc::build() {
9586
FFN_GELU_ERF,
9687
-1);
9788

89+
} else if (proj_type == PROJECTOR_TYPE_GLMA) {
90+
cur = ggml_norm(ctx0, cur, hparams.eps);
91+
cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w);
92+
cur = ggml_add(ctx0, cur, model.mm_norm_pre_b);
93+
cur = build_stack(cur, hparams.proj_stack_factor, n_embd);
94+
cur = build_ffn(cur, model.mm_1_w, model.mm_1_b, nullptr, nullptr, model.mm_2_w, model.mm_2_b, hparams.ffn_op, 0);
95+
cur = ggml_concat(ctx0, model.mm_boi, cur, 1);
96+
cur = ggml_concat(ctx0, cur, model.mm_eoi, 1);
9897
} else {
9998
GGML_ABORT("%s: unknown projector type", __func__);
10099
}

0 commit comments

Comments
 (0)