
Commit de20afd

build_ultravox()
1 parent bc708b4 commit de20afd

1 file changed (+124 −26)

tools/mtmd/clip.cpp

Lines changed: 124 additions & 26 deletions
@@ -1321,6 +1321,103 @@ struct clip_graph {
         return gf;
     }
 
+    // whisper encoder with ultravox projector
+    ggml_cgraph * build_ultravox() {
+        const int n_step = img.nx;
+        const int n_pos  = n_step / 2;
+
+        ggml_tensor * inp = build_inp_raw(1);
+
+        ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
+        ggml_set_name(positions, "positions");
+        ggml_set_input(positions);
+
+        // conv1d block
+        {
+            // convolution + gelu
+            ggml_tensor * cur = ggml_conv_1d_ph(ctx0, model.conv1d_1_w, inp, 1, 1);
+            cur = ggml_add(ctx0, cur, model.conv1d_1_b);
+
+            cur = ggml_gelu(ctx0, cur);
+
+            cur = ggml_conv_1d_ph(ctx0, model.conv1d_2_w, cur, 2, 1);
+            cur = ggml_add(ctx0, cur, model.conv1d_2_b);
+
+            cur = ggml_gelu(ctx0, cur);
+            // transpose
+            inp = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
+            cb(inp, "after_conv1d", -1);
+        }
+
+        // sanity check (only check one layer, but it should be the same for all)
+        GGML_ASSERT(model.layers[0].ln_1_w && model.layers[0].ln_1_b);
+        GGML_ASSERT(model.layers[0].ln_2_w && model.layers[0].ln_2_b);
+        GGML_ASSERT(model.layers[0].q_b);
+        GGML_ASSERT(model.layers[0].v_b);
+        GGML_ASSERT(!model.layers[0].k_b); // no bias for k
+        GGML_ASSERT(model.post_ln_w && model.post_ln_b);
+
+        ggml_tensor * pos_embd_selected = ggml_get_rows(ctx0, model.position_embeddings, positions);
+        ggml_tensor * cur = build_vit(
+                                inp, n_pos,
+                                NORM_TYPE_NORMAL,
+                                hparams.ffn_op,
+                                pos_embd_selected,
+                                nullptr);
+
+        cb(cur, "after_transformer", -1);
+
+        // StackAudioFrames
+        // https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/blob/main/ultravox_model.py
+        {
+            int64_t stride = n_embd * hparams.proj_stack_factor;
+            int64_t padded_len = GGML_PAD(ggml_nelements(cur), stride);
+            int64_t pad = padded_len - ggml_nelements(cur);
+            if (pad > 0) {
+                cur = ggml_view_1d(ctx0, cur, ggml_nelements(cur), 0);
+                cur = ggml_pad(ctx0, cur, pad, 0, 0, 0);
+            }
+            cur = ggml_view_2d(ctx0, cur, stride, padded_len / stride,
+                                ggml_row_size(cur->type, stride), 0);
+        }
+
+        cb(cur, "after_stacked", -1);
+
+        // UltravoxProjector
+        {
+            // pre-norm
+            cur = ggml_rms_norm(ctx0, cur, 1e-6);
+            cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w);
+
+            // ffn in
+            cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
+
+            // swiglu
+            {
+                int64_t split_point = cur->ne[0] / 2;
+                ggml_tensor * x0 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0));
+                ggml_tensor * x1 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur)));
+
+                // see SwiGLU in ultravox_model.py: it is the second half that is passed through silu, not the first half
+                x1 = ggml_silu(ctx0, x1);
+                cur = ggml_mul(ctx0, x0, x1);
+            }
+
+            // mid-norm
+            cur = ggml_rms_norm(ctx0, cur, 1e-6);
+            cur = ggml_mul(ctx0, cur, model.mm_norm_mid_w);
+
+            // ffn out
+            cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
+        }
+
+        cb(cur, "projected", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
 private:
     //
     // utility functions
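
The shape arithmetic in build_ultravox() is worth spelling out: n_pos is half of the n_step mel frames because the second conv1d uses stride 2, and StackAudioFrames then zero-pads the flattened sequence so it splits evenly into rows of n_embd * proj_stack_factor elements, i.e. roughly n_pos / proj_stack_factor embeddings reach the projector. A minimal standalone C++ sketch of that arithmetic follows; the concrete numbers (3000 frames, n_embd = 1280, proj_stack_factor = 8) are assumptions for illustration, not values taken from this commit.

#include <cstdint>
#include <cstdio>

// rounds x up to the next multiple of n (same behaviour as GGML_PAD in ggml.h)
static int64_t pad_to_multiple(int64_t x, int64_t n) {
    return ((x + n - 1) / n) * n;
}

int main() {
    // assumed example values, not taken from the commit
    const int64_t n_step            = 3000; // mel frames in the input (img.nx)
    const int64_t n_embd            = 1280; // whisper encoder width (assumption)
    const int64_t proj_stack_factor = 8;    // hparams.proj_stack_factor (assumption)

    const int64_t n_pos      = n_step / 2;                       // stride-2 conv halves the frame count
    const int64_t n_elems    = n_embd * n_pos;                   // elements coming out of the transformer
    const int64_t stride     = n_embd * proj_stack_factor;       // one stacked row
    const int64_t padded_len = pad_to_multiple(n_elems, stride); // zero-pad to a whole number of rows
    const int64_t n_tokens   = padded_len / stride;              // rows fed to the Ultravox projector

    printf("audio embeddings entering the projector: %lld\n", (long long) n_tokens);
    return 0;
}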
@@ -1471,8 +1568,8 @@ struct clip_graph {
         return inp;
     }
 
-    ggml_tensor * build_inp_raw() {
-        ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, img.nx, img.ny, 3);
+    ggml_tensor * build_inp_raw(int channels = 3) {
+        ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, img.nx, img.ny, channels);
         ggml_set_name(inp_raw, "inp_raw");
         ggml_set_input(inp_raw);
         return inp_raw;
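
The only change to an existing helper is the optional channel count on build_inp_raw(): image graphs keep the 3-channel RGB default, while build_ultravox() passes 1 because the raw audio input is a single-channel mel spectrogram. A hedged sketch of the two call patterns; the tensor sizes and the 128 mel bins are illustrative assumptions, not values from the commit.

#include <cstdio>

// hypothetical stand-in for build_inp_raw(): the default argument keeps the
// existing image call sites unchanged, audio call sites pass 1 explicitly
static void report_inp_raw_shape(int nx, int ny, int channels = 3) {
    printf("inp_raw: %d x %d x %d\n", nx, ny, channels);
}

int main() {
    report_inp_raw_shape(336, 336);     // image graph: RGB, uses the default of 3
    report_inp_raw_shape(3000, 128, 1); // ultravox graph: mel frames x mel bins, 1 channel
    return 0;
}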
@@ -1736,7 +1833,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             } break;
         case PROJECTOR_TYPE_ULTRAVOX:
             {
-                res = nullptr;
+                res = graph.build_ultravox();
             } break;
         default:
             {
@@ -1945,6 +2042,7 @@ struct clip_model_loader {
             case PROJECTOR_TYPE_ULTRAVOX:
                 {
                     get_u32(KEY_PROJ_STACK_FACTOR, hparams.proj_stack_factor);
+                    hparams.ffn_op = FFN_GELU;
                 } break;
             default:
                 break;
@@ -2029,31 +2127,31 @@ struct clip_model_loader {
         vision_model.layers.resize(hparams.n_layer);
         for (int il = 0; il < hparams.n_layer; ++il) {
             auto & layer = vision_model.layers[il];
-            layer.k_w    = get_tensor(string_format(TN_ATTN_K,      "v", il, "weight"));
-            layer.q_w    = get_tensor(string_format(TN_ATTN_Q,      "v", il, "weight"));
-            layer.v_w    = get_tensor(string_format(TN_ATTN_V,      "v", il, "weight"));
-            layer.o_w    = get_tensor(string_format(TN_ATTN_OUTPUT, "v", il, "weight"));
-            layer.k_norm = get_tensor(string_format(TN_ATTN_K_NORM, "v", il, "weight"), false);
-            layer.q_norm = get_tensor(string_format(TN_ATTN_Q_NORM, "v", il, "weight"), false);
-            layer.ln_1_w = get_tensor(string_format(TN_LN_1,        "v", il, "weight"), false);
-            layer.ln_2_w = get_tensor(string_format(TN_LN_2,        "v", il, "weight"), false);
-            layer.ls_1_w = get_tensor(string_format(TN_LS_1,        "v", il, "weight"), false); // no bias
-            layer.ls_2_w = get_tensor(string_format(TN_LS_2,        "v", il, "weight"), false); // no bias
-
-            layer.k_b    = get_tensor(string_format(TN_ATTN_K,      "v", il, "bias"), false);
-            layer.q_b    = get_tensor(string_format(TN_ATTN_Q,      "v", il, "bias"), false);
-            layer.v_b    = get_tensor(string_format(TN_ATTN_V,      "v", il, "bias"), false);
-            layer.o_b    = get_tensor(string_format(TN_ATTN_OUTPUT, "v", il, "bias"), false);
-            layer.ln_1_b = get_tensor(string_format(TN_LN_1,        "v", il, "bias"), false);
-            layer.ln_2_b = get_tensor(string_format(TN_LN_2,        "v", il, "bias"), false);
+            layer.k_w    = get_tensor(string_format(TN_ATTN_K,      prefix, il, "weight"));
+            layer.q_w    = get_tensor(string_format(TN_ATTN_Q,      prefix, il, "weight"));
+            layer.v_w    = get_tensor(string_format(TN_ATTN_V,      prefix, il, "weight"));
+            layer.o_w    = get_tensor(string_format(TN_ATTN_OUTPUT, prefix, il, "weight"));
+            layer.k_norm = get_tensor(string_format(TN_ATTN_K_NORM, prefix, il, "weight"), false);
+            layer.q_norm = get_tensor(string_format(TN_ATTN_Q_NORM, prefix, il, "weight"), false);
+            layer.ln_1_w = get_tensor(string_format(TN_LN_1,        prefix, il, "weight"), false);
+            layer.ln_2_w = get_tensor(string_format(TN_LN_2,        prefix, il, "weight"), false);
+            layer.ls_1_w = get_tensor(string_format(TN_LS_1,        prefix, il, "weight"), false); // no bias
+            layer.ls_2_w = get_tensor(string_format(TN_LS_2,        prefix, il, "weight"), false); // no bias
+
+            layer.k_b    = get_tensor(string_format(TN_ATTN_K,      prefix, il, "bias"), false);
+            layer.q_b    = get_tensor(string_format(TN_ATTN_Q,      prefix, il, "bias"), false);
+            layer.v_b    = get_tensor(string_format(TN_ATTN_V,      prefix, il, "bias"), false);
+            layer.o_b    = get_tensor(string_format(TN_ATTN_OUTPUT, prefix, il, "bias"), false);
+            layer.ln_1_b = get_tensor(string_format(TN_LN_1,        prefix, il, "bias"), false);
+            layer.ln_2_b = get_tensor(string_format(TN_LN_2,        prefix, il, "bias"), false);
 
             // ffn
-            layer.ff_up_w   = get_tensor(string_format(TN_FFN_UP,   "v", il, "weight"));
-            layer.ff_up_b   = get_tensor(string_format(TN_FFN_UP,   "v", il, "bias"), false);
-            layer.ff_gate_w = get_tensor(string_format(TN_FFN_GATE, "v", il, "weight"), false);
-            layer.ff_gate_b = get_tensor(string_format(TN_FFN_GATE, "v", il, "bias"), false);
-            layer.ff_down_w = get_tensor(string_format(TN_FFN_DOWN, "v", il, "weight"));
-            layer.ff_down_b = get_tensor(string_format(TN_FFN_DOWN, "v", il, "bias"), false);
+            layer.ff_up_w   = get_tensor(string_format(TN_FFN_UP,   prefix, il, "weight"));
+            layer.ff_up_b   = get_tensor(string_format(TN_FFN_UP,   prefix, il, "bias"), false);
+            layer.ff_gate_w = get_tensor(string_format(TN_FFN_GATE, prefix, il, "weight"), false);
+            layer.ff_gate_b = get_tensor(string_format(TN_FFN_GATE, prefix, il, "bias"), false);
+            layer.ff_down_w = get_tensor(string_format(TN_FFN_DOWN, prefix, il, "weight"));
+            layer.ff_down_b = get_tensor(string_format(TN_FFN_DOWN, prefix, il, "bias"), false);
 
             // some models already exported with legacy (incorrect) naming which is quite messy, let's fix it here
             // note: Qwen model converted from the old surgery script has n_ff = 0, so we cannot use n_ff to check!
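
The loader hunk replaces the hard-coded "v" tensor-name prefix with a prefix variable so the same per-layer loop can load both the vision tower and an audio encoder whose tensors are exported under a different prefix. A minimal sketch of the idea; the is_audio flag, the "a" prefix, and the name pattern are assumptions for illustration, not taken from this commit.

#include <cstdio>
#include <string>

// hypothetical stand-in for string_format(TN_ATTN_Q, prefix, il, suffix)
static std::string tensor_name(const char * prefix, int il, const char * suffix) {
    char buf[128];
    snprintf(buf, sizeof(buf), "%s.blk.%d.attn_q.%s", prefix, il, suffix);
    return buf;
}

int main() {
    const bool   is_audio = true;                 // assumption: derived from the projector type
    const char * prefix   = is_audio ? "a" : "v"; // assumption: "a" for audio, "v" for vision

    printf("%s\n", tensor_name(prefix, 0, "weight").c_str()); // e.g. a.blk.0.attn_q.weight
    return 0;
}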

0 commit comments
