@@ -3836,9 +3836,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
             // conv1d
             conv1d_1_w = create_tensor(tn(LLM_TENSOR_WHISPER_CONV1, "weight"), {3, 128, n_embd}, 0);
-            conv1d_1_b = create_tensor(tn(LLM_TENSOR_WHISPER_CONV1, "bias"  ), {n_embd}, 0);
+            conv1d_1_b = create_tensor(tn(LLM_TENSOR_WHISPER_CONV1, "bias"  ), {1, n_embd}, 0);
             conv1d_2_w = create_tensor(tn(LLM_TENSOR_WHISPER_CONV2, "weight"), {3, n_embd, n_embd}, 0);
-            conv1d_2_b = create_tensor(tn(LLM_TENSOR_WHISPER_CONV2, "bias"  ), {n_embd}, 0);
+            conv1d_2_b = create_tensor(tn(LLM_TENSOR_WHISPER_CONV2, "bias"  ), {1, n_embd}, 0);
 
             // mm projector
             // https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/blob/main/ultravox_model.py#L553
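
Why the bias shape changes from {n_embd} to {1, n_embd}: ggml_conv_1d puts the
frame axis in dim 0 and the channel axis in dim 1 of its output, and ggml_add only
broadcasts a src1 whose dimensions repeat into src0. A {1, n_embd} bias repeats
cleanly along the frame axis, while a 1-D {n_embd} tensor would sit on the frame
axis and generally fail the broadcast check. A minimal, self-contained sketch of
that broadcast (illustrative sizes, not from the commit):

    #include "ggml.h"

    int main(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16*1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        const int64_t n_frames = 8; // conv output length (dim 0)
        const int64_t n_embd   = 4; // channels           (dim 1)

        // conv-style output: ne[0] = n_frames, ne[1] = n_embd
        struct ggml_tensor * out  = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_frames, n_embd);
        // per-channel bias: one value per row of `out`
        struct ggml_tensor * bias = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, n_embd);

        // ggml_add repeats `bias` across dim 0, i.e. the same per-channel
        // value is added to every frame; a plain {n_embd} tensor would not
        // satisfy the ggml_can_repeat() assertion here
        struct ggml_tensor * sum = ggml_add(ctx, out, bias);
        (void) sum;

        ggml_free(ctx);
        return 0;
    }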
@@ -12132,6 +12132,25 @@ struct llm_build_ultravox_enc : public llm_graph_context {
         }
         res->add_input(std::move(inp));
 
+        if (ubatch.embd) {
+            // reshape to [2*n_ctx, n_mel]
+            cur = ggml_view_2d(ctx0, inpL, 2*hparams.n_ctx_train, hparams.n_mel_bins,
+                    ggml_row_size(inpL->type, 2*hparams.n_ctx_train), 0);
+
+            // convolution + gelu
+            cur = ggml_conv_1d_ph(ctx0, model.conv1d_1_w, cur, 1, 1);
+            cur = ggml_add(ctx0, cur, model.conv1d_1_b);
+
+            cur = ggml_gelu(ctx0, cur);
+
+            cur = ggml_conv_1d_ph(ctx0, model.conv1d_2_w, cur, 2, 1);
+            cur = ggml_add(ctx0, cur, model.conv1d_2_b);
+
+            cur = ggml_gelu(ctx0, cur);
+            // transpose to [n_embd, n_tokens] for the transformer layers
+            inpL = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
+        }
+
         // add position embeddings
         ggml_tensor * pos_embd_selected = ggml_view_2d(ctx0, model.pos_embd,
                 model.pos_embd->ne[0], ubatch.n_tokens,
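
This block mirrors Whisper's convolutional front-end: the raw mel spectrogram is
passed through two half-padded 1-D convolutions with GELU activations, the second
with stride 2, before the transformer layers see it. A shape trace under the
tensor shapes declared above (n_mel_bins = 128 per conv1d_1_w; n_ctx_train = 1500
is an assumption here, Whisper's usual encoder length):

    // inpL (mel view)                : [2*n_ctx, n_mel]  = [3000, 128]
    // conv1d_1 (k=3, s=1, half pad)  : [2*n_ctx, n_embd]
    // + bias {1, n_embd}, gelu       : [2*n_ctx, n_embd]
    // conv1d_2 (k=3, s=2, half pad)  : [n_ctx,   n_embd]  stride 2 halves the frames
    // + bias {1, n_embd}, gelu       : [n_ctx,   n_embd]
    // transpose + cont               : [n_embd,  n_ctx]   one column per token
    //
    // so n_tokens == n_ctx when pos_embd_selected is taken below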