
Commit cf4f5d2

various fixes
1 parent cf38b47 commit cf4f5d2

10 files changed: +107 −32 lines changed


.editorconfig

Lines changed: 4 additions & 0 deletions
@@ -48,3 +48,7 @@ end_of_line = unset
 charset = unset
 trim_trailing_whitespace = unset
 insert_final_newline = unset
+
+[tools/mtmd/miniaudio.h]
+trim_trailing_whitespace = unset
+insert_final_newline = unset

convert_hf_to_gguf.py

Lines changed: 10 additions & 3 deletions
@@ -1119,6 +1119,8 @@ class VisionModel(ModelBase):
     model_arch = gguf.MODEL_ARCH.CLIP_VISION
     preprocessor_config: dict[str, Any]
     global_config: dict[str, Any]
+    has_vision_encoder: bool = True
+    has_audio_encoder: bool = False

     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -1159,7 +1161,10 @@ def set_type(self):
     def set_gguf_parameters(self):
         self.gguf_writer.add_file_type(self.ftype)
         self.gguf_writer.add_vision_projection_dim(self.n_embd_text)
-        self.gguf_writer.add_vision_has_vision_encoder(True)
+        if self.has_vision_encoder:
+            self.gguf_writer.add_vision_has_vision_encoder(True)
+        if self.has_audio_encoder:
+            self.gguf_writer.add_vision_has_audio_encoder(True)

         # vision config
         self.gguf_writer.add_vision_image_size(self.find_hparam(["image_size"]))
@@ -5969,6 +5974,7 @@ def _reverse_hf_permute(data_torch, n_heads, hidden_dim):
 @ModelBase.register("UltravoxModel")
 class UltravoxModel(TextModel):
     model_arch = gguf.MODEL_ARCH.LLAMA # dummy
+
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         raise NotImplementedError("Ultravox does not have text decoder. Please use --mmproj argument")
@@ -5978,6 +5984,8 @@ def __init__(self, *args, **kwargs):
 class UltravoxAudioModel(VisionModel):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
+        self.has_vision_encoder = False
+        self.has_audio_encoder = True
         self.hparams["image_size"] = self.hparams["num_mel_bins"]
         self.hparams["patch_size"] = self.hparams["num_mel_bins"]
         self.hparams["hidden_size"] = self.hparams["d_model"]
@@ -5988,7 +5996,6 @@ def __init__(self, *args, **kwargs):

     def set_gguf_parameters(self):
         super().set_gguf_parameters()
-        self.gguf_writer.add_bool(gguf.Keys.ClipVision.HAS_AUDIO_ENC, True)
         self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.ULTRAVOX)
         self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-5))
         self.gguf_writer.add_uint32(gguf.Keys.ClipVision.Projector.STACK_FACTOR, self.global_config["stack_factor"])
@@ -5998,7 +6005,7 @@ def tensor_force_quant(self, name, new_name, bid, n_dims):
         if ".conv" in name and ".weight" in name:
             return gguf.GGMLQuantizationType.F16
         return False
-
+
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid # unused
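
Note: with these flags, converter subclasses declare which encoders they provide (UltravoxAudioModel above marks itself audio-only), and the matching clip.has_vision_encoder / clip.has_audio_encoder booleans are written to the GGUF metadata. A minimal read-back sketch, assuming ggml's gguf.h C API (gguf_init_from_file, gguf_find_key, gguf_get_val_bool); "mmproj.gguf" is a placeholder path:

    // sketch: inspect the modality flags emitted by the converter
    #include "gguf.h"
    #include <cstdio>

    int main() {
        struct gguf_init_params params = { /*no_alloc =*/ true, /*ctx =*/ nullptr };
        struct gguf_context * gctx = gguf_init_from_file("mmproj.gguf", params);
        if (!gctx) {
            return 1;
        }
        const char * keys[] = { "clip.has_vision_encoder", "clip.has_audio_encoder" };
        for (const char * key : keys) {
            int64_t id = gguf_find_key(gctx, key);
            // flags written via add_bool are simply absent when not set
            printf("%s = %s\n", key, id >= 0 && gguf_get_val_bool(gctx, id) ? "true" : "false");
        }
        gguf_free(gctx);
        return 0;
    }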

gguf-py/gguf/constants.py

Lines changed: 1 addition & 1 deletion
@@ -220,9 +220,9 @@ class Adapter:
     LORA_ALPHA = "adapter.lora.alpha"

 class ClipVision:
-    HAS_AUDIO_ENC = "clip.has_audio_encoder"
     PROJECTOR_TYPE = "clip.projector_type"
     HAS_VISION_ENCODER = "clip.has_vision_encoder"
+    HAS_AUDIO_ENCODER = "clip.has_audio_encoder"
     HAS_LLAVA_PROJECTOR = "clip.has_llava_projector"
     IMAGE_SIZE = "clip.vision.image_size"
     PATCH_SIZE = "clip.vision.patch_size"

gguf-py/gguf/gguf_writer.py

Lines changed: 3 additions & 0 deletions
@@ -942,6 +942,9 @@ def add_vision_projection_dim(self, value: int) -> None:
     def add_vision_has_vision_encoder(self, value: bool) -> None:
         self.add_bool(Keys.ClipVision.HAS_VISION_ENCODER, value)

+    def add_vision_has_audio_encoder(self, value: bool) -> None:
+        self.add_bool(Keys.ClipVision.HAS_AUDIO_ENCODER, value)
+
     def add_vision_patch_size(self, value: int) -> None:
         self.add_uint32(Keys.ClipVision.PATCH_SIZE, value)

tools/mtmd/clip-impl.h

Lines changed: 1 addition & 0 deletions
@@ -17,6 +17,7 @@
 #define KEY_NAME "general.name"
 #define KEY_DESCRIPTION "general.description"
 #define KEY_HAS_AUDIO_ENC "clip.has_audio_encoder"
+#define KEY_HAS_VISION_ENC "clip.has_vision_encoder"
 #define KEY_MINICPMV_VERSION "clip.minicpmv_version"
 #define KEY_USE_GELU "clip.use_gelu"
 #define KEY_USE_SILU "clip.use_silu"

tools/mtmd/clip.cpp

Lines changed: 20 additions & 9 deletions
@@ -165,6 +165,7 @@ enum patch_merge_type {
 };

 struct clip_hparams {
+    bool has_vision = false;
     bool has_audio = false;

     int32_t image_size;
@@ -2029,15 +2030,16 @@ struct clip_model_loader {
         {
             get_i32(KEY_MINICPMV_VERSION, ctx_clip.minicpmv_version, false); // legacy

-            get_bool(KEY_HAS_AUDIO_ENC, hparams.has_audio, false);
-            get_u32(KEY_N_EMBD, hparams.n_embd);
-            get_u32(KEY_N_HEAD, hparams.n_head);
-            get_u32(KEY_N_FF, hparams.n_ff);
-            get_u32(KEY_N_BLOCK, hparams.n_layer);
-            get_u32(KEY_PROJ_DIM, hparams.projection_dim);
-            get_f32(KEY_LAYER_NORM_EPS, hparams.eps);
-            get_u32(KEY_IMAGE_SIZE, hparams.image_size);
-            get_u32(KEY_PATCH_SIZE, hparams.patch_size);
+            get_bool(KEY_HAS_AUDIO_ENC, hparams.has_audio, false);
+            get_bool(KEY_HAS_VISION_ENC, hparams.has_vision, false);
+            get_u32(KEY_N_EMBD, hparams.n_embd);
+            get_u32(KEY_N_HEAD, hparams.n_head);
+            get_u32(KEY_N_FF, hparams.n_ff);
+            get_u32(KEY_N_BLOCK, hparams.n_layer);
+            get_u32(KEY_PROJ_DIM, hparams.projection_dim);
+            get_f32(KEY_LAYER_NORM_EPS, hparams.eps);
+            get_u32(KEY_IMAGE_SIZE, hparams.image_size);
+            get_u32(KEY_PATCH_SIZE, hparams.patch_size);
             get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false);
             get_arr_int(KEY_IMAGE_GRID_PINPOINTS, hparams.image_grid_pinpoints, false);

@@ -2173,6 +2175,7 @@ struct clip_model_loader {
         }

         LOG_INF("%s: projector: %s\n", __func__, proj_type.c_str());
+        LOG_INF("%s: has_vision_encoder: %d\n", __func__, hparams.has_vision);
         LOG_INF("%s: has_audio_encoder: %d\n", __func__, hparams.has_audio);
         LOG_INF("%s: n_embd: %d\n", __func__, hparams.n_embd);
         LOG_INF("%s: n_head: %d\n", __func__, hparams.n_head);
@@ -3953,6 +3956,14 @@ bool clip_is_gemma3(const struct clip_ctx * ctx) {
     return ctx->proj_type == PROJECTOR_TYPE_GEMMA3;
 }

+bool clip_has_vision_encoder(const struct clip_ctx * ctx) {
+    return ctx->vision_model.hparams.has_vision;
+}
+
+bool clip_has_audio_encoder(const struct clip_ctx * ctx) {
+    return ctx->vision_model.hparams.has_audio;
+}
+
 bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
     clip_image_f32 clip_img;
     clip_img.buf.resize(h * w * 3);

tools/mtmd/clip.h

Lines changed: 3 additions & 0 deletions
@@ -96,3 +96,6 @@ bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img,

 // use by audio input
 void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel, int n_step, float * mel);
+
+bool clip_has_vision_encoder(const struct clip_ctx * ctx);
+bool clip_has_audio_encoder(const struct clip_ctx * ctx);
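
Note: a minimal caller-side sketch using only the two predicates declared above; describe_modalities is a hypothetical helper and assumes a clip_ctx already loaded from an mmproj file:

    // hypothetical helper: report which encoders a loaded mmproj provides
    static const char * describe_modalities(const struct clip_ctx * ctx) {
        bool v = clip_has_vision_encoder(ctx);
        bool a = clip_has_audio_encoder(ctx);
        if (v && a) { return "vision+audio"; }
        if (v)      { return "vision"; }
        if (a)      { return "audio"; }
        return "none"; // neither flag was set in the GGUF metadata
    }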

tools/mtmd/mtmd-audio.cpp

Lines changed: 19 additions & 8 deletions
@@ -29,6 +29,9 @@

 // most of the code here is copied from whisper.cpp

+// align x to upper multiple of n
+#define _ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n))
+
 namespace whisper_preprocessor {

 #define SIN_COS_N_COUNT WHISPER_N_FFT
@@ -298,9 +301,17 @@ bool preprocess_audio(
         size_t n_samples,
         whisper_filters & filters,
         whisper_mel & output) {
+
+    // a bit hacky, but we want to align the output to a multiple of WHISPER_N_FFT * proj_stack_factor
+    // proj_stack_factor is 8, specifically for Ultravox (so this is a temporary solution)
+
+    size_t n_padded = _ALIGN(n_samples, WHISPER_N_FFT * 8);
+    std::vector<float> samples_padded(n_padded, 0.0f);
+    std::copy(samples, samples + n_samples, samples_padded.data());
+
     return log_mel_spectrogram(
-            samples,
-            n_samples,
+            samples_padded.data(),
+            samples_padded.size(),
             COMMON_SAMPLE_RATE,
             WHISPER_N_FFT,
             WHISPER_HOP_LENGTH,
@@ -391,7 +402,7 @@ bool read_wav_from_buf(const unsigned char * buf_in, size_t len, int target_sampler_rate,
         ma_decoder_uninit(&decoder);
         return false;
     }
-
+
     double resample_ratio = (double)target_sampler_rate / decoder.outputSampleRate;
     // Reserve for mono output
     pcmf32_mono.reserve(static_cast<size_t>(total_frames_expected_from_decoder * resample_ratio * 1.1) + 1);
@@ -411,9 +422,9 @@ bool read_wav_from_buf(const unsigned char * buf_in, size_t len, int target_sampler_rate,
     }

     if (frames_decoded_this_iteration == 0 && result == MA_AT_END) { // Ensure we process the last bit if MA_AT_END was from previous read
-         break;
+        break;
     }
-
+
     ma_uint64 frame_count_in = frames_decoded_this_iteration;
     ma_uint64 frame_count_out_capacity;

@@ -423,7 +434,7 @@ bool read_wav_from_buf(const unsigned char * buf_in, size_t len, int target_sampler_rate,
         ma_decoder_uninit(&decoder);
         return false;
     }
-
+
     size_t current_pcmf32_sample_offset = pcmf32_mono.size();
     // Resize for mono output (channelsOut is 1)
     pcmf32_mono.resize(current_pcmf32_sample_offset + frame_count_out_capacity * data_converter.channelsOut);
@@ -433,7 +444,7 @@ bool read_wav_from_buf(const unsigned char * buf_in, size_t len, int target_sampler_rate,
     result = ma_data_converter_process_pcm_frames(
         &data_converter,
         temp_decode_buffer.data(),
-        &frame_count_in,
+        &frame_count_in,
         pcmf32_mono.data() + current_pcmf32_sample_offset,
         &frames_actually_output
     );
@@ -443,7 +454,7 @@ bool read_wav_from_buf(const unsigned char * buf_in, size_t len, int target_sampler_rate,
         ma_decoder_uninit(&decoder);
         return false;
     }
-
+
     // Adjust size to actual frames output (mono)
     pcmf32_mono.resize(current_pcmf32_sample_offset + frames_actually_output * data_converter.channelsOut);
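
Note: a worked example of the padding arithmetic above, assuming WHISPER_N_FFT is 400 (as in whisper.cpp), so the alignment unit for the Ultravox stack factor of 8 is 400 * 8 = 3200 samples:

    // standalone check of the round-up-to-multiple macro
    #include <cstddef>
    #include <cstdio>

    #define _ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n))

    int main() {
        size_t n_samples = 100000;                     // ~6.25 s of 16 kHz mono audio
        size_t n_padded  = _ALIGN(n_samples, 400 * 8); // rounds up to 102400
        // the extra 2400 samples are zero-filled before log_mel_spectrogram
        printf("%zu -> %zu\n", n_samples, n_padded);
        return 0;
    }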

tools/mtmd/mtmd.cpp

Lines changed: 41 additions & 11 deletions
@@ -97,6 +97,8 @@ struct mtmd_context {
     bool print_timings;
     int n_threads;
     std::string image_marker;
+    bool has_vision;
+    bool has_audio;

     // for llava-uhd style models, we need special tokens in-between slices
     // minicpmv calls them "slices", llama 4 calls them "tiles"
@@ -135,7 +137,9 @@ struct mtmd_context {
             throw std::runtime_error(string_format("Failed to load CLIP model from %s\n", mmproj_fname));
         }

-        use_mrope = clip_is_qwen2vl(ctx_clip);
+        has_vision = clip_has_vision_encoder(ctx_clip);
+        has_audio  = clip_has_audio_encoder(ctx_clip);
+        use_mrope  = clip_is_qwen2vl(ctx_clip);

         projector_type proj = clip_get_projector_type(ctx_clip);
         int minicpmv_version = clip_is_minicpmv(ctx_clip);
@@ -362,15 +366,24 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
         output->entries.emplace_back(std::move(chunk));

         // only add image/audio tokens to middle of 2 parts
-        bool is_not_last = &parts.back() != &part;
+        // therefore, we skip handling image/audio if this is the last part
+        if (&parts.back() == &part) {
+            continue;
+        }
+
+        if (!bitmaps[i_bm]->is_audio) {
+            // handle image

-        // handle image
-        if (is_not_last && !bitmaps[i_bm]->is_audio) {
             if (i_bm >= n_bitmaps) {
                 LOG_ERR("%s: error: not enough images for %d parts\n", __func__, (int)parts.size());
                 return 1;
             }

+            if (!ctx->has_vision) {
+                LOG_ERR("%s: error: model does not support vision input\n", __func__);
+                return 2;
+            }
+
             // convert mtmd_bitmap to clip_image_u8
             clip_image_u8_ptr img_u8(clip_image_u8_init());
             img_u8->nx = bitmaps[i_bm]->nx;
@@ -486,15 +499,20 @@ int32_t mtmd_tokenize(mtmd_context * ctx,

             i_bm++; // move to next image
             continue;
-        }
-
-        // handle audio
-        if (is_not_last && bitmaps[i_bm]->is_audio) {
+
+        } else {
+            // handle audio
+
             if (i_bm >= n_bitmaps) {
                 LOG_ERR("%s: error: not enough images for %d parts\n", __func__, (int)parts.size());
                 return 1;
             }

+            if (!ctx->has_audio) {
+                LOG_ERR("%s: error: model does not support audio input\n", __func__);
+                return 2;
+            }
+
             // preprocess audio
             whisper_preprocessor::whisper_mel mel_spec;
             GGML_ASSERT(ctx->w_filters.n_mel);
@@ -506,9 +524,11 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
                 return 2;
             }

-            // DEBUG!!!!!!!!!!
-            printf("mel_spec.n_len = %d\n", mel_spec.n_len);
-            printf("mel_spec.n_mel = %d\n", mel_spec.n_mel);
+            // DEBUG!!!
+            // mel_spec.data.resize(220*8*2 * mel_spec.n_mel);
+            // mel_spec.n_len = 220*8*2;
+            LOG_DBG("mel_spec.n_len = %d\n", mel_spec.n_len);
+            LOG_DBG("mel_spec.n_mel = %d\n", mel_spec.n_mel);

             // convert mel spectrogram to clip_image_f32_batch
             clip_image_f32_ptr mel_f32(clip_image_f32_init());
@@ -526,6 +546,8 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
             audio_tokens->batch_f32 = std::move(batch_f32);
             audio_tokens->id = bitmaps[i_bm]->id; // optional

+            LOG_DBG("audio_tokens->n_tokens = %d\n", audio_tokens->n_tokens);
+
             mtmd_input_chunk chunk{
                 MTMD_INPUT_CHUNK_TYPE_AUDIO,
                 {}, // text tokens
@@ -606,6 +628,14 @@ bool mtmd_decode_use_mrope(mtmd_context * ctx) {
     return ctx->use_mrope;
 }

+bool mtmd_support_vision(mtmd_context * ctx) {
+    return ctx->has_vision;
+}
+
+bool mtmd_support_audio(mtmd_context * ctx) {
+    return ctx->has_audio;
+}
+
 // these 2 helpers below use internal clip_image_u8_ptr,
 // so unfortunately they cannot moved to mtmd-helper.h
 // however, in theory, user can decode image file to bitmap using

tools/mtmd/mtmd.h

Lines changed: 5 additions & 0 deletions
@@ -99,6 +99,11 @@ MTMD_API bool mtmd_decode_use_non_causal(mtmd_context * ctx);
 // whether the current model use M-RoPE for llama_decode
 MTMD_API bool mtmd_decode_use_mrope(mtmd_context * ctx);

+// whether the current model supports vision input
+MTMD_API bool mtmd_support_vision(mtmd_context * ctx);
+
+// whether the current model supports audio input
+MTMD_API bool mtmd_support_audio(mtmd_context * ctx);

 // mtmd_bitmap
 //
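
Note: a caller-side sketch of the new capability checks, mirroring what mtmd_tokenize now does internally (it returns 2 when a bitmap's modality is unsupported); validate_bitmap_modality is a hypothetical helper:

    // hypothetical pre-flight check before handing bitmaps to mtmd_tokenize
    #include "mtmd.h"
    #include <cstdio>

    static bool validate_bitmap_modality(mtmd_context * ctx, bool bitmap_is_audio) {
        if (bitmap_is_audio && !mtmd_support_audio(ctx)) {
            fprintf(stderr, "model does not support audio input\n");
            return false;
        }
        if (!bitmap_is_audio && !mtmd_support_vision(ctx)) {
            fprintf(stderr, "model does not support vision input\n");
            return false;
        }
        return true;
    }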
