@@ -1321,6 +1321,103 @@ struct clip_graph {
13211321 return gf;
13221322 }
13231323
    // Build the compute graph for the whisper audio encoder followed by the
    // ultravox projector. Input is a single-channel spectrogram of img.nx time
    // steps; output is a sequence of embeddings in the language model's space.
    ggml_cgraph * build_ultravox() {
        const int n_step = img.nx;
        // the second conv layer below has stride 2, halving the time dimension
        const int n_pos = n_step / 2;

        // raw input with 1 channel (audio), instead of the default 3 (RGB)
        ggml_tensor * inp = build_inp_raw(1);

        // position ids used to select rows of the learned position embeddings
        ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
        ggml_set_name(positions, "positions");
        ggml_set_input(positions);

        // conv1d block
        {
            // convolution + gelu (stride 1, half padding)
            ggml_tensor * cur = ggml_conv_1d_ph(ctx0, model.conv1d_1_w, inp, 1, 1);
            cur = ggml_add(ctx0, cur, model.conv1d_1_b);

            cur = ggml_gelu(ctx0, cur);

            // second convolution downsamples by 2 (stride 2, half padding)
            cur = ggml_conv_1d_ph(ctx0, model.conv1d_2_w, cur, 2, 1);
            cur = ggml_add(ctx0, cur, model.conv1d_2_b);

            cur = ggml_gelu(ctx0, cur);
            // transpose so the transformer sees [n_embd, n_pos]
            inp = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
            cb(inp, "after_conv1d", -1);
        }

        // sanity check (only check one layer, but it should be the same for all)
        GGML_ASSERT(model.layers[0].ln_1_w && model.layers[0].ln_1_b);
        GGML_ASSERT(model.layers[0].ln_2_w && model.layers[0].ln_2_b);
        GGML_ASSERT(model.layers[0].q_b);
        GGML_ASSERT(model.layers[0].v_b);
        GGML_ASSERT(!model.layers[0].k_b); // no bias for k
        GGML_ASSERT(model.post_ln_w && model.post_ln_b);

        // pick the position embeddings matching the (downsampled) sequence length
        ggml_tensor * pos_embd_selected = ggml_get_rows(ctx0, model.position_embeddings, positions);
        ggml_tensor * cur = build_vit(
                                inp, n_pos,
                                NORM_TYPE_NORMAL,
                                hparams.ffn_op,
                                pos_embd_selected,
                                nullptr);

        cb(cur, "after_transformer", -1);

        // StackAudioFrames: concatenate groups of proj_stack_factor consecutive
        // frames into single wider frames, zero-padding the tail if needed
        // https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/blob/main/ultravox_model.py
        {
            int64_t stride = n_embd * hparams.proj_stack_factor;
            int64_t padded_len = GGML_PAD(ggml_nelements(cur), stride);
            int64_t pad = padded_len - ggml_nelements(cur);
            if (pad > 0) {
                // flatten to 1d before padding so the pad applies to the tail
                cur = ggml_view_1d(ctx0, cur, ggml_nelements(cur), 0);
                cur = ggml_pad(ctx0, cur, pad, 0, 0, 0);
            }
            // reinterpret as [stride, padded_len / stride]: stacked frames
            cur = ggml_view_2d(ctx0, cur, stride, padded_len / stride,
                                ggml_row_size(cur->type, stride), 0);
        }

        cb(cur, "after_stacked", -1);

        // UltravoxProjector: rms_norm -> linear -> swiglu -> rms_norm -> linear
        {
            // pre-norm
            cur = ggml_rms_norm(ctx0, cur, 1e-6);
            cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w);

            // ffn in
            cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);

            // swiglu
            {
                int64_t split_point = cur->ne[0] / 2;
                ggml_tensor * x0 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0));
                ggml_tensor * x1 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur)));

                // see SwiGLU in ultravox_model.py, the second half passed through is silu, not the first half
                x1 = ggml_silu(ctx0, x1);
                cur = ggml_mul(ctx0, x0, x1);
            }

            // mid-norm
            cur = ggml_rms_norm(ctx0, cur, 1e-6);
            cur = ggml_mul(ctx0, cur, model.mm_norm_mid_w);

            // ffn out
            cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
        }

        cb(cur, "projected", -1);

        ggml_build_forward_expand(gf, cur);

        return gf;
    }
1420+
13241421private:
13251422 //
13261423 // utility functions
@@ -1471,8 +1568,8 @@ struct clip_graph {
14711568 return inp;
14721569 }
14731570
    // Create the graph input tensor holding the raw image (or audio) data as
    // F32 with dimensions [img.nx, img.ny, channels]. The default of 3
    // channels covers RGB images; build_ultravox() passes 1 for audio.
    ggml_tensor * build_inp_raw(int channels = 3) {
        ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, img.nx, img.ny, channels);
        ggml_set_name(inp_raw, "inp_raw");
        ggml_set_input(inp_raw);
        return inp_raw;
    }
@@ -1736,7 +1833,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
17361833 } break ;
17371834 case PROJECTOR_TYPE_ULTRAVOX:
17381835 {
1739- res = nullptr ;
1836+ res = graph. build_ultravox () ;
17401837 } break ;
17411838 default :
17421839 {
@@ -1945,6 +2042,7 @@ struct clip_model_loader {
19452042 case PROJECTOR_TYPE_ULTRAVOX:
19462043 {
19472044 get_u32 (KEY_PROJ_STACK_FACTOR, hparams.proj_stack_factor );
2045+ hparams.ffn_op = FFN_GELU;
19482046 } break ;
19492047 default :
19502048 break ;
@@ -2029,31 +2127,31 @@ struct clip_model_loader {
20292127 vision_model.layers .resize (hparams.n_layer );
20302128 for (int il = 0 ; il < hparams.n_layer ; ++il) {
20312129 auto & layer = vision_model.layers [il];
2032- layer.k_w = get_tensor (string_format (TN_ATTN_K, " v " , il, " weight" ));
2033- layer.q_w = get_tensor (string_format (TN_ATTN_Q, " v " , il, " weight" ));
2034- layer.v_w = get_tensor (string_format (TN_ATTN_V, " v " , il, " weight" ));
2035- layer.o_w = get_tensor (string_format (TN_ATTN_OUTPUT, " v " , il, " weight" ));
2036- layer.k_norm = get_tensor (string_format (TN_ATTN_K_NORM, " v " , il, " weight" ), false );
2037- layer.q_norm = get_tensor (string_format (TN_ATTN_Q_NORM, " v " , il, " weight" ), false );
2038- layer.ln_1_w = get_tensor (string_format (TN_LN_1, " v " , il, " weight" ), false );
2039- layer.ln_2_w = get_tensor (string_format (TN_LN_2, " v " , il, " weight" ), false );
2040- layer.ls_1_w = get_tensor (string_format (TN_LS_1, " v " , il, " weight" ), false ); // no bias
2041- layer.ls_2_w = get_tensor (string_format (TN_LS_2, " v " , il, " weight" ), false ); // no bias
2042-
2043- layer.k_b = get_tensor (string_format (TN_ATTN_K, " v " , il, " bias" ), false );
2044- layer.q_b = get_tensor (string_format (TN_ATTN_Q, " v " , il, " bias" ), false );
2045- layer.v_b = get_tensor (string_format (TN_ATTN_V, " v " , il, " bias" ), false );
2046- layer.o_b = get_tensor (string_format (TN_ATTN_OUTPUT, " v " , il, " bias" ), false );
2047- layer.ln_1_b = get_tensor (string_format (TN_LN_1, " v " , il, " bias" ), false );
2048- layer.ln_2_b = get_tensor (string_format (TN_LN_2, " v " , il, " bias" ), false );
2130+ layer.k_w = get_tensor (string_format (TN_ATTN_K, prefix , il, " weight" ));
2131+ layer.q_w = get_tensor (string_format (TN_ATTN_Q, prefix , il, " weight" ));
2132+ layer.v_w = get_tensor (string_format (TN_ATTN_V, prefix , il, " weight" ));
2133+ layer.o_w = get_tensor (string_format (TN_ATTN_OUTPUT, prefix , il, " weight" ));
2134+ layer.k_norm = get_tensor (string_format (TN_ATTN_K_NORM, prefix , il, " weight" ), false );
2135+ layer.q_norm = get_tensor (string_format (TN_ATTN_Q_NORM, prefix , il, " weight" ), false );
2136+ layer.ln_1_w = get_tensor (string_format (TN_LN_1, prefix , il, " weight" ), false );
2137+ layer.ln_2_w = get_tensor (string_format (TN_LN_2, prefix , il, " weight" ), false );
2138+ layer.ls_1_w = get_tensor (string_format (TN_LS_1, prefix , il, " weight" ), false ); // no bias
2139+ layer.ls_2_w = get_tensor (string_format (TN_LS_2, prefix , il, " weight" ), false ); // no bias
2140+
2141+ layer.k_b = get_tensor (string_format (TN_ATTN_K, prefix , il, " bias" ), false );
2142+ layer.q_b = get_tensor (string_format (TN_ATTN_Q, prefix , il, " bias" ), false );
2143+ layer.v_b = get_tensor (string_format (TN_ATTN_V, prefix , il, " bias" ), false );
2144+ layer.o_b = get_tensor (string_format (TN_ATTN_OUTPUT, prefix , il, " bias" ), false );
2145+ layer.ln_1_b = get_tensor (string_format (TN_LN_1, prefix , il, " bias" ), false );
2146+ layer.ln_2_b = get_tensor (string_format (TN_LN_2, prefix , il, " bias" ), false );
20492147
20502148 // ffn
2051- layer.ff_up_w = get_tensor (string_format (TN_FFN_UP, " v " , il, " weight" ));
2052- layer.ff_up_b = get_tensor (string_format (TN_FFN_UP, " v " , il, " bias" ), false );
2053- layer.ff_gate_w = get_tensor (string_format (TN_FFN_GATE, " v " , il, " weight" ), false );
2054- layer.ff_gate_b = get_tensor (string_format (TN_FFN_GATE, " v " , il, " bias" ), false );
2055- layer.ff_down_w = get_tensor (string_format (TN_FFN_DOWN, " v " , il, " weight" ));
2056- layer.ff_down_b = get_tensor (string_format (TN_FFN_DOWN, " v " , il, " bias" ), false );
2149+ layer.ff_up_w = get_tensor (string_format (TN_FFN_UP, prefix , il, " weight" ));
2150+ layer.ff_up_b = get_tensor (string_format (TN_FFN_UP, prefix , il, " bias" ), false );
2151+ layer.ff_gate_w = get_tensor (string_format (TN_FFN_GATE, prefix , il, " weight" ), false );
2152+ layer.ff_gate_b = get_tensor (string_format (TN_FFN_GATE, prefix , il, " bias" ), false );
2153+ layer.ff_down_w = get_tensor (string_format (TN_FFN_DOWN, prefix , il, " weight" ));
2154+ layer.ff_down_b = get_tensor (string_format (TN_FFN_DOWN, prefix , il, " bias" ), false );
20572155
20582156 // some models already exported with legacy (incorrect) naming which is quite messy, let's fix it here
20592157 // note: Qwen model converted from the old surgery script has n_ff = 0, so we cannot use n_ff to check!
0 commit comments