fix merge conflict

ngxson · ngxson · commit bc708b4ec5e2 · 2025-05-18T20:23:51.000+02:00
diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h
@@ -102,6 +102,7 @@
 #define TN_MM_AUDIO_MLP "mm.a.mlp.%d.%s"
 #define TN_MM_NORM_PRE  "mm.a.norm_pre.%s"
 #define TN_MM_NORM_MID  "mm.a.norm_mid.%s"
+
 // align x to upper multiple of n
 #define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n))
 
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
@@ -1736,7 +1736,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             } break;
         case PROJECTOR_TYPE_ULTRAVOX:
             {
-                GGML_ASSERT(imgs.entries.size() == 1);
+                res = nullptr;
             } break;
         default:
             {
@@ -2205,6 +2205,7 @@ struct clip_model_loader {
                     vision_model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
                     vision_model.mm_norm_pre_w = get_tensor(string_format(TN_MM_NORM_PRE, "weight"));
                     vision_model.mm_norm_mid_w = get_tensor(string_format(TN_MM_NORM_MID, "weight"));
+                } break;
             case PROJECTOR_TYPE_INTERNVL:
                 {
                     vision_model.mm_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight"));
diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h
@@ -97,3 +97,6 @@ bool clip_is_llava(const struct clip_ctx * ctx);
 bool clip_is_gemma3(const struct clip_ctx * ctx);
 
 bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);
+
+// used by audio input
+void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel, int n_step, float * mel);

Original file line number	Diff line number	Diff line change
`@@ -1736,7 +1736,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32`
`1736`	`1736`	`} break;`
`1737`	`1737`	`case PROJECTOR_TYPE_ULTRAVOX:`
`1738`	`1738`	`{`
`1739`		`- GGML_ASSERT(imgs.entries.size() == 1);`
	`1739`	`+ res = nullptr;`
`1740`	`1740`	`} break;`
`1741`	`1741`	`default:`
`1742`	`1742`	`{`
`@@ -2205,6 +2205,7 @@ struct clip_model_loader {`
`2205`	`2205`	`vision_model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));`
`2206`	`2206`	`vision_model.mm_norm_pre_w = get_tensor(string_format(TN_MM_NORM_PRE, "weight"));`
`2207`	`2207`	`vision_model.mm_norm_mid_w = get_tensor(string_format(TN_MM_NORM_MID, "weight"));`
	`2208`	`+ } break;`
`2208`	`2209`	`case PROJECTOR_TYPE_INTERNVL:`
`2209`	`2210`	`{`
`2210`	`2211`	`vision_model.mm_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight"));`