warmup ok

ngxson · ngxson · commit 8b73116c5fc8 · 2025-05-04T17:20:38.000+02:00
diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
@@ -5827,8 +5827,8 @@ def __init__(self, *args, **kwargs):
 class UltravoxAudioModel(VisionModel):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        self.hparams["image_size"] = 0
-        self.hparams["patch_size"] = 0
+        self.hparams["image_size"] = self.hparams["num_mel_bins"]
+        self.hparams["patch_size"] = self.hparams["num_mel_bins"]
         self.hparams["hidden_size"] = self.hparams["d_model"]
         self.hparams["intermediate_size"] = self.hparams["d_model"]
         self.hparams["num_attention_heads"] = self.hparams["encoder_attention_heads"]
@@ -5847,6 +5847,15 @@ def tensor_force_quant(self, name, new_name, bid, n_dims):
         if ".conv" in name:
             return gguf.GGMLQuantizationType.F16
         return False
+    
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        if "conv1.bias" in name or "conv2.bias" in name:
+            # append a trailing size-1 dim to the conv1/conv2 bias (unsqueeze, not a transpose), e.g. 1-D [n] -> [n, 1]
+            data_torch = data_torch.unsqueeze(-1)
+
+        return [(self.map_tensor_name(name), data_torch)]
 
 ###### CONVERSION LOGIC ######
 
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
@@ -1088,12 +1088,12 @@ class TensorNameMap:
             "audio_tower.conv{bid}", # ultravox
         ),
 
-        MODEL_TENSOR.A_PRE_NORM: (
+        MODEL_TENSOR.A_PRE_NORM: (),
+
+        MODEL_TENSOR.A_POST_NORM: (
             "audio_tower.layer_norm", # ultravox
         ),
 
-        MODEL_TENSOR.A_POST_NORM: (),
-
         MODEL_TENSOR.A_ENC_ATTN_Q: (
             "audio_tower.layers.{bid}.self_attn.q_proj", # ultravox
         ),
diff --git a/tools/llava/clip.cpp b/tools/llava/clip.cpp
@@ -1056,13 +1056,14 @@ static ggml_cgraph * clip_image_build_graph_ultravox(clip_ctx * ctx, const clip_
     const auto & model = ctx->vision_model;
     const auto & hparams = model.hparams;
 
-    int n_step = img.nx;
-    int n_mel  = img.ny;
+    const int n_step = img.nx;
+    const int n_mel  = img.ny;
 
     const int n_embd  = hparams.hidden_size;
     const int n_head  = hparams.n_head;
     const int d_head  = n_embd / n_head;
     const int n_layer = hparams.n_layer;
+    const int n_pos   = n_step / 2;
     const float eps   = hparams.eps;
 
     ggml_init_params params = {
@@ -1080,7 +1081,7 @@ static ggml_cgraph * clip_image_build_graph_ultravox(clip_ctx * ctx, const clip_
     ggml_set_name(inp_raw, "inp_raw");
     ggml_set_input(inp_raw);
 
-    struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_step);
+    struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
     ggml_set_name(positions, "positions");
     ggml_set_input(positions);
 
@@ -1125,20 +1126,17 @@ static ggml_cgraph * clip_image_build_graph_ultravox(clip_ctx * ctx, const clip_
             ggml_tensor * k = ggml_mul_mat(ctx0, layer.k_w, cur); // no bias for key
             ggml_tensor * v = ggml_add(ctx0, ggml_mul_mat(ctx0, layer.v_w, cur), layer.v_b);
 
-            q = ggml_reshape_3d(ctx0, q, d_head, n_head, n_step);
-            k = ggml_reshape_3d(ctx0, k, d_head, n_head, n_step);
-            v = ggml_reshape_3d(ctx0, v, d_head, n_head, n_step);
+            q = ggml_reshape_3d(ctx0, q, d_head, n_head, n_pos);
+            k = ggml_reshape_3d(ctx0, k, d_head, n_head, n_pos);
+            v = ggml_reshape_3d(ctx0, v, d_head, n_head, n_pos);
 
             q = ggml_cont(ctx0, ggml_permute(ctx0, q, 0, 2, 1, 3));
             q = ggml_scale(ctx0, q, 1.0f / std::sqrt(d_head));
-            // utils.debug_print(q, "q rope");
 
             k = ggml_cont(ctx0, ggml_permute(ctx0, k, 0, 2, 1, 3));
-            // utils.debug_print(k, "k rope");
 
             ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
             kq = ggml_soft_max_ext(ctx0, kq, nullptr, 1.0f, 0.0f);
-            // utils.debug_print(kq, "kq softmax");
 
             v = ggml_cont(ctx0, ggml_permute(ctx0, v, 1, 2, 0, 3));
 
@@ -2217,6 +2215,10 @@ struct clip_model_loader {
                 } break;
             case PROJECTOR_TYPE_ULTRAVOX:
                 {
+                    vision_model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
+                    vision_model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias"));
+                    vision_model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight"));
+                    vision_model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias"));
                     vision_model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
                     vision_model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
                     vision_model.mm_norm_pre_w = get_tensor(string_format(TN_MM_NORM_PRE, "weight"));