@@ -3836,9 +3836,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
             // conv1d
             conv1d_1_w = create_tensor(tn(LLM_TENSOR_WHISPER_CONV1, "weight"), {3, 128, n_embd}, 0);
-            conv1d_1_b = create_tensor(tn(LLM_TENSOR_WHISPER_CONV1, "bias"  ), {n_embd}, 0);
+            conv1d_1_b = create_tensor(tn(LLM_TENSOR_WHISPER_CONV1, "bias"  ), {1, n_embd}, 0);
             conv1d_2_w = create_tensor(tn(LLM_TENSOR_WHISPER_CONV2, "weight"), {3, n_embd, n_embd}, 0);
-            conv1d_2_b = create_tensor(tn(LLM_TENSOR_WHISPER_CONV2, "bias"  ), {n_embd}, 0);
+            conv1d_2_b = create_tensor(tn(LLM_TENSOR_WHISPER_CONV2, "bias"  ), {1, n_embd}, 0);
 
             // mm projector
             // https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/blob/main/ultravox_model.py#L553
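
Why the bias shape changes from {n_embd} to {1, n_embd}: ggml_conv_1d puts the
frame axis in dim 0 and the channel axis in dim 1 of its output, and ggml_add only
broadcasts a src1 whose dimensions repeat into src0. A {1, n_embd} bias repeats
cleanly along the frame axis, while a 1-D {n_embd} tensor would sit on the frame
axis and generally fail the broadcast check. A minimal, self-contained sketch of
that broadcast (illustrative sizes, not from the commit):

    #include "ggml.h"

    int main(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16*1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        const int64_t n_frames = 8; // conv output length (dim 0)
        const int64_t n_embd   = 4; // channels           (dim 1)

        // conv-style output: ne[0] = n_frames, ne[1] = n_embd
        struct ggml_tensor * out  = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_frames, n_embd);
        // per-channel bias: one value per row of `out`
        struct ggml_tensor * bias = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, n_embd);

        // ggml_add repeats `bias` across dim 0, i.e. the same per-channel
        // value is added to every frame; a plain {n_embd} tensor would not
        // satisfy the ggml_can_repeat() assertion here
        struct ggml_tensor * sum = ggml_add(ctx, out, bias);
        (void) sum;

        ggml_free(ctx);
        return 0;
    }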
@@ -12132,6 +12132,25 @@ struct llm_build_ultravox_enc : public llm_graph_context {
         }
         res->add_input(std::move(inp));
 
+        if (ubatch.embd) {
+            // reshape to [2*n_ctx, n_mel]
+            cur = ggml_view_2d(ctx0, inpL, 2*hparams.n_ctx_train, hparams.n_mel_bins,
+                    ggml_row_size(inpL->type, 2*hparams.n_ctx_train), 0);
+
+            // convolution + gelu
+            cur = ggml_conv_1d_ph(ctx0, model.conv1d_1_w, cur, 1, 1);
+            cur = ggml_add(ctx0, cur, model.conv1d_1_b);
+
+            cur = ggml_gelu(ctx0, cur);
+
+            cur = ggml_conv_1d_ph(ctx0, model.conv1d_2_w, cur, 2, 1);
+            cur = ggml_add(ctx0, cur, model.conv1d_2_b);
+
+            cur = ggml_gelu(ctx0, cur);
+            // transpose to [n_embd, n_tokens] for the transformer layers
+            inpL = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
+        }
+
         // add position embeddings
         ggml_tensor * pos_embd_selected = ggml_view_2d(ctx0, model.pos_embd,
                 model.pos_embd->ne[0], ubatch.n_tokens,
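
This block mirrors Whisper's convolutional front-end: the raw mel spectrogram is
passed through two half-padded 1-D convolutions with GELU activations, the second
with stride 2, before the transformer layers see it. A shape trace under the
tensor shapes declared above (n_mel_bins = 128 per conv1d_1_w; n_ctx_train = 1500
is an assumption here, Whisper's usual encoder length):

    // inpL (mel view)                : [2*n_ctx, n_mel]  = [3000, 128]
    // conv1d_1 (k=3, s=1, half pad)  : [2*n_ctx, n_embd]
    // + bias {1, n_embd}, gelu       : [2*n_ctx, n_embd]
    // conv1d_2 (k=3, s=2, half pad)  : [n_ctx,   n_embd]  stride 2 halves the frames
    // + bias {1, n_embd}, gelu       : [n_ctx,   n_embd]
    // transpose + cont               : [n_embd,  n_ctx]   one column per token
    //
    // so n_tokens == n_ctx when pos_embd_selected is taken below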