Skip to content

Commit 5fc3507

Browse files
committed
clean up
1 parent cd909c2 commit 5fc3507

File tree

1 file changed

+38
-30
lines changed

1 file changed

+38
-30
lines changed

tools/mtmd/clip.cpp

Lines changed: 38 additions & 30 deletions
Original file line number | Diff line number | Diff line change
@@ -354,6 +354,16 @@ struct clip_model {
354354
ggml_tensor * conv1d_2_b = nullptr;
355355
ggml_tensor * mm_norm_pre_w = nullptr;
356356
ggml_tensor * mm_norm_mid_w = nullptr;
357+
358+
bool audio_has_avgpool() const {
359+
return proj_type == PROJECTOR_TYPE_QWEN2A
360+
|| proj_type == PROJECTOR_TYPE_VOXTRAL;
361+
}
362+
363+
bool audio_has_stack_frames() const {
364+
return proj_type == PROJECTOR_TYPE_ULTRAVOX
365+
|| proj_type == PROJECTOR_TYPE_VOXTRAL;
366+
}
357367
};
358368

359369
struct clip_ctx {
@@ -1483,10 +1493,22 @@ struct clip_graph {
14831493

14841494
cb(cur, "after_transformer", -1);
14851495

1486-
if (ctx->proj_type() == PROJECTOR_TYPE_ULTRAVOX) {
1487-
cur = build_whisper_stack_audio_frames(cur);
1496+
if (model.audio_has_stack_frames()) {
1497+
// StackAudioFrames
1498+
// https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/blob/main/ultravox_model.py
1499+
int64_t stride = n_embd * hparams.proj_stack_factor;
1500+
int64_t padded_len = GGML_PAD(ggml_nelements(cur), stride);
1501+
int64_t pad = padded_len - ggml_nelements(cur);
1502+
if (pad > 0) {
1503+
cur = ggml_view_1d(ctx0, cur, ggml_nelements(cur), 0);
1504+
cur = ggml_pad(ctx0, cur, pad, 0, 0, 0);
1505+
}
1506+
cur = ggml_view_2d(ctx0, cur, stride, padded_len / stride,
1507+
ggml_row_size(cur->type, stride), 0);
14881508
cb(cur, "after_stacked", -1);
1509+
}
14891510

1511+
if (ctx->proj_type() == PROJECTOR_TYPE_ULTRAVOX) {
14901512
// UltravoxProjector
14911513
{
14921514
// pre-norm
@@ -1514,7 +1536,7 @@ struct clip_graph {
15141536
cur = ggml_add(ctx0, cur, model.mm_fc_b);
15151537

15161538
} else if (ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL) {
1517-
cur = build_whisper_stack_audio_frames(cur);
1539+
// projector
15181540
cb(cur, "after_stacked", -1);
15191541
cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
15201542
cur = ggml_relu(ctx0, cur);
@@ -1531,21 +1553,6 @@ struct clip_graph {
15311553
return gf;
15321554
}
15331555

1534-
ggml_tensor * build_whisper_stack_audio_frames(ggml_tensor * cur) {
1535-
// StackAudioFrames
1536-
// https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/blob/main/ultravox_model.py
1537-
int64_t stride = n_embd * hparams.proj_stack_factor;
1538-
int64_t padded_len = GGML_PAD(ggml_nelements(cur), stride);
1539-
int64_t pad = padded_len - ggml_nelements(cur);
1540-
if (pad > 0) {
1541-
cur = ggml_view_1d(ctx0, cur, ggml_nelements(cur), 0);
1542-
cur = ggml_pad(ctx0, cur, pad, 0, 0, 0);
1543-
}
1544-
cur = ggml_view_2d(ctx0, cur, stride, padded_len / stride,
1545-
ggml_row_size(cur->type, stride), 0);
1546-
return cur;
1547-
}
1548-
15491556
private:
15501557
//
15511558
// utility functions
@@ -1679,8 +1686,7 @@ struct clip_graph {
16791686
inpL = cur;
16801687
}
16811688

1682-
// TODO @ngxson : find a way to move this outside
1683-
if (ctx->proj_type() == PROJECTOR_TYPE_QWEN2A || ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL) {
1689+
if (ctx->model.audio_has_avgpool()) {
16841690
ggml_tensor * cur = inpL;
16851691
cur = ggml_transpose(ctx0, cur);
16861692
cur = ggml_cont(ctx0, cur);
@@ -3593,21 +3599,23 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
35933599
} break;
35943600
case PROJECTOR_TYPE_VOXTRAL:
35953601
case PROJECTOR_TYPE_ULTRAVOX:
3602+
case PROJECTOR_TYPE_QWEN2A:
35963603
{
3604+
// whisper downscales input token by half after conv1d
3605+
n_patches_sq = img->nx / 2;
3606+
35973607
const int proj_stack_factor = ctx->model.hparams.proj_stack_factor;
3598-
const int n_len = CLIP_ALIGN(img->nx, proj_stack_factor);
3599-
n_patches_sq = n_len / proj_stack_factor / 2;
3608+
if (ctx->model.audio_has_stack_frames()) {
3609+
GGML_ASSERT(proj_stack_factor > 0);
3610+
const int n_len = CLIP_ALIGN(n_patches_sq, proj_stack_factor);
3611+
n_patches_sq = n_len / proj_stack_factor;
3612+
}
36003613

3601-
if (proj == PROJECTOR_TYPE_VOXTRAL) {
3602-
n_patches_sq /= 2; // divide by 2 because of nn.AvgPool1d(2, stride=2)
3614+
if (ctx->model.audio_has_avgpool()) {
3615+
// divide by 2 because of nn.AvgPool1d(2, stride=2)
3616+
n_patches_sq /= 2;
36033617
}
36043618
} break;
3605-
case PROJECTOR_TYPE_QWEN2A:
3606-
{
3607-
// divide by 2 because of whisper
3608-
// another divide by 2 because of nn.AvgPool1d(2, stride=2)
3609-
n_patches_sq = img->nx / 4;
3610-
} break;
36113619
default:
36123620
GGML_ABORT("unsupported projector type");
36133621
}

0 commit comments

Comments (0)