@@ -1087,13 +1087,6 @@ static ggml_cgraph * clip_image_build_graph_ultravox(clip_ctx * ctx, const clip_
10871087
10881088 ggml_tensor * inp;
10891089
1090- auto layer_norm = [&](ggml_tensor * x, ggml_tensor * w, ggml_tensor * b) {
1091- x = ggml_norm (ctx0, x, eps);
1092- x = ggml_mul (ctx0, x, w);
1093- x = ggml_add (ctx0, x, b);
1094- return x;
1095- };
1096-
10971090 // conv1d block
10981091 {
10991092 // convolution + gelu
@@ -1118,7 +1111,8 @@ static ggml_cgraph * clip_image_build_graph_ultravox(clip_ctx * ctx, const clip_
11181111 auto & layer = model.layers [il];
11191112 ggml_tensor * cur = inp;
11201113
1121- cur = layer_norm (cur, layer.ln_1_w , layer.ln_1_b );
1114+ cur = ggml_norm (ctx0, cur, eps);
1115+ cur = ggml_add (ctx0, ggml_mul (ctx0, cur, layer.ln_1_w ), layer.ln_1_b );
11221116
11231117 // attention
11241118 {
@@ -1131,19 +1125,17 @@ static ggml_cgraph * clip_image_build_graph_ultravox(clip_ctx * ctx, const clip_
11311125 v = ggml_reshape_3d (ctx0, v, d_head, n_head, n_pos);
11321126
11331127 q = ggml_cont (ctx0, ggml_permute (ctx0, q, 0 , 2 , 1 , 3 ));
1134- q = ggml_scale (ctx0, q, 1 .0f / std::sqrt (d_head));
1135-
11361128 k = ggml_cont (ctx0, ggml_permute (ctx0, k, 0 , 2 , 1 , 3 ));
11371129
11381130 ggml_tensor * kq = ggml_mul_mat (ctx0, k, q);
1139- kq = ggml_soft_max_ext (ctx0, kq, nullptr , 1 .0f , 0 .0f );
1131+ kq = ggml_soft_max_ext (ctx0, kq, nullptr , 1 .0f / std::sqrt (d_head) , 0 .0f );
11401132
11411133 v = ggml_cont (ctx0, ggml_permute (ctx0, v, 1 , 2 , 0 , 3 ));
11421134
11431135 ggml_tensor * kqv = ggml_mul_mat (ctx0, v, kq);
1144- // kqv = ggml_reshape_3d(ctx0, kqv, d_head, n_tokens , n_head);
1136+ // kqv = ggml_reshape_3d(ctx0, kqv, d_head, n_pos , n_head);
11451137 kqv = ggml_permute (ctx0, kqv, 0 , 2 , 1 , 3 );
1146- kqv = ggml_cont_2d (ctx0, kqv, n_embd, d_head );
1138+ kqv = ggml_cont_2d (ctx0, kqv, n_embd, n_pos );
11471139
11481140 cur = ggml_add (ctx0, ggml_mul_mat (ctx0, layer.o_w , kqv), layer.o_b );
11491141 }
@@ -1152,7 +1144,8 @@ static ggml_cgraph * clip_image_build_graph_ultravox(clip_ctx * ctx, const clip_
11521144 cur = ggml_add (ctx0, cur, inp);
11531145
11541146 inp = cur; // inp = residual, cur = hidden_states
1155- cur = layer_norm (cur, layer.ln_2_w , layer.ln_2_b );
1147+ cur = ggml_norm (ctx0, cur, eps);
1148+ cur = ggml_add (ctx0, ggml_mul (ctx0, cur, layer.ln_2_w ), layer.ln_2_b );
11561149
11571150 // mlp
11581151 {
@@ -1170,10 +1163,11 @@ static ggml_cgraph * clip_image_build_graph_ultravox(clip_ctx * ctx, const clip_
11701163 ggml_tensor * embeddings = inp;
11711164
11721165 // output norm
1173- embeddings = layer_norm (embeddings, model.post_ln_w , model.post_ln_b );
1166+ embeddings = ggml_norm (ctx0, embeddings, eps);
1167+ embeddings = ggml_add (ctx0, ggml_mul (ctx0, embeddings, model.post_ln_w ), model.post_ln_b );
11741168
1175- // pad and stack
1176- // https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/blob/main/ultravox_model.py#L520
1169+ // StackAudioFrames
1170+ // https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/blob/main/ultravox_model.py
11771171 {
11781172 int64_t stride = n_embd * hparams.proj_stack_factor ;
11791173 int64_t padded_len = GGML_PAD (ggml_nelements (embeddings), stride);
@@ -1186,11 +1180,11 @@ static ggml_cgraph * clip_image_build_graph_ultravox(clip_ctx * ctx, const clip_
11861180 ggml_row_size (embeddings->type , stride), 0 );
11871181 }
11881182
1189- // projection
1183+ // UltravoxProjector
11901184 {
11911185 ggml_tensor * cur = embeddings;
11921186 // pre-norm
1193- cur = ggml_rms_norm (ctx0, cur, eps );
1187+ cur = ggml_rms_norm (ctx0, cur, 1e-6 );
11941188 cur = ggml_mul (ctx0, cur, model.mm_norm_pre_w );
11951189
11961190 // ffn in
@@ -1202,12 +1196,13 @@ static ggml_cgraph * clip_image_build_graph_ultravox(clip_ctx * ctx, const clip_
12021196 ggml_tensor * x0 = ggml_cont (ctx0, ggml_view_2d (ctx0, cur, split_point, cur->ne [1 ], cur->nb [1 ], 0 ));
12031197 ggml_tensor * x1 = ggml_cont (ctx0, ggml_view_2d (ctx0, cur, split_point, cur->ne [1 ], cur->nb [1 ], split_point * ggml_element_size (cur)));
12041198
1205- x0 = ggml_silu (ctx0, x0);
1199+ // see SwiGLU in ultravox_model.py, the second half passed through is silu, not the first half
1200+ x1 = ggml_silu (ctx0, x1);
12061201 cur = ggml_mul (ctx0, x0, x1);
12071202 }
12081203
12091204 // mid-norm
1210- cur = ggml_rms_norm (ctx0, cur, eps );
1205+ cur = ggml_rms_norm (ctx0, cur, 1e-6 );
12111206 cur = ggml_mul (ctx0, cur, model.mm_norm_mid_w );
12121207
12131208 // ffn out
@@ -1216,6 +1211,11 @@ static ggml_cgraph * clip_image_build_graph_ultravox(clip_ctx * ctx, const clip_
12161211 embeddings = cur;
12171212 }
12181213
1214+ embeddings = ggml_view_2d (ctx0, embeddings, 2048 , n_step / 16 ,
1215+ ggml_row_size (embeddings->type , 2048 ), 0 );
1216+
1217+ printf (" shape of embd: %lld %lld %lld\n " , embeddings->ne [0 ], embeddings->ne [1 ], embeddings->ne [2 ]);
1218+
12191219 // build the graph
12201220 ggml_build_forward_expand (gf, embeddings);
12211221
@@ -3350,7 +3350,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
33503350 };
33513351
33523352 // set input pixel values
3353- {
3353+ if (!imgs. is_audio ) {
33543354 size_t nelem = 0 ;
33553355 for (const auto & img : imgs.entries ) {
33563356 nelem += img->nx * img->ny * 3 ;
@@ -3387,6 +3387,16 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
33873387 }
33883388 }
33893389 set_input_f32 (" inp_raw" , inp_raw);
3390+
3391+ } else {
3392+ // audio input
3393+ GGML_ASSERT (imgs.entries .size () == 1 );
3394+ const auto & mel_inp = imgs.entries [0 ]; // 3 channels, but only use one
3395+ const int n_step = mel_inp->nx ;
3396+ const int n_mel = mel_inp->ny ;
3397+ std::vector<float > inp_raw (n_step * n_mel);
3398+ std::memcpy (inp_raw.data (), mel_inp->buf .data (), n_step * n_mel * sizeof (float ));
3399+ set_input_f32 (" inp_raw" , inp_raw);
33903400 }
33913401
33923402 // set input per projector
@@ -3585,6 +3595,16 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
35853595 {
35863596 // do nothing
35873597 } break ;
3598+ case PROJECTOR_TYPE_ULTRAVOX:
3599+ {
3600+ const auto & mel_inp = imgs.entries [0 ];
3601+ const int n_pos = mel_inp->nx / 2 ;
3602+ std::vector<int32_t > positions (n_pos);
3603+ for (int i = 0 ; i < n_pos; i++) {
3604+ positions[i] = i;
3605+ }
3606+ set_input_i32 (" positions" , positions);
3607+ } break ;
35883608 default :
35893609 GGML_ABORT (" Unknown projector type" );
35903610 }
@@ -3820,3 +3840,14 @@ bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img,
// Returns the projector type recorded in the given clip context (reads ctx->proj_type).
// ctx is dereferenced without a null check, so the caller must pass a valid context.
38203840projector_type clip_get_projector_type (const struct clip_ctx * ctx) {
38213841 return ctx->proj_type ;
38223842}
3843+
3844+ void clip_image_f32_batch_add_mel (struct clip_image_f32_batch * batch, int n_mel, int n_step, float * mel) {
3845+ clip_image_f32 * audio = new clip_image_f32;
3846+ audio->nx = n_step;
3847+ audio->ny = n_mel;
3848+ audio->buf .resize (n_step * n_mel);
3849+ std::memcpy (audio->buf .data (), mel, n_step * n_mel * sizeof (float ));
3850+
3851+ batch->entries .push_back (clip_image_f32_ptr (audio));
3852+ batch->is_audio = true ;
3853+ }
0 commit comments