Skip to content

Commit d657fa3

Browse files
committed
mtmd : support home-cooked Mistral Small Omni
1 parent 8ad7b3e commit d657fa3

File tree

2 files changed

+17
-3
lines changed

2 files changed

+17
-3
lines changed

tools/mtmd/clip-impl.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
#define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon"
3131

3232
// vision-specific
33+
#define KEY_VISION_PROJ_TYPE "clip.vision.projector_type" // for models with mixed modalities: vision-specific projector type, consulted by the loader when KEY_PROJ_TYPE yields an empty string
3334
#define KEY_IMAGE_SIZE "clip.vision.image_size"
3435
#define KEY_PATCH_SIZE "clip.vision.patch_size"
3536
#define KEY_IMAGE_MEAN "clip.vision.image_mean"
@@ -46,6 +47,7 @@
4647
#define KEY_MINICPMV_VERSION "clip.minicpmv_version"
4748

4849
// audio-specific
50+
#define KEY_AUDIO_PROJ_TYPE "clip.audio.projector_type" // for models with mixed modalities: audio-specific projector type, consulted by the loader when KEY_PROJ_TYPE yields an empty string
4951
#define KEY_A_NUM_MEL_BINS "clip.audio.num_mel_bins"
5052
#define KEY_A_PROJ_STACK_FACTOR "clip.audio.projector.stack_factor"
5153

tools/mtmd/clip.cpp

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2098,15 +2098,27 @@ struct clip_model_loader {
20982098
// projector type
20992099
std::string proj_type;
21002100
{
2101+
// default key
21012102
get_string(KEY_PROJ_TYPE, proj_type, false);
2102-
if (!proj_type.empty()) {
2103-
model.proj_type = clip_projector_type_from_string(proj_type);
2103+
2104+
// for models with mixed modalities
2105+
if (proj_type.empty()) {
2106+
if (modality == CLIP_MODALITY_VISION) {
2107+
get_string(KEY_VISION_PROJ_TYPE, proj_type, false);
2108+
} else if (modality == CLIP_MODALITY_AUDIO) {
2109+
get_string(KEY_AUDIO_PROJ_TYPE, proj_type, false);
2110+
} else {
2111+
GGML_ABORT("unknown modality");
2112+
}
21042113
}
2114+
2115+
model.proj_type = clip_projector_type_from_string(proj_type);
2116+
21052117
if (model.proj_type == PROJECTOR_TYPE_UNKNOWN) {
21062118
throw std::runtime_error(string_format("%s: unknown projector type: %s\n", __func__, proj_type.c_str()));
21072119
}
21082120

2109-
// correct arch for multimodal models
2121+
// correct arch for multimodal models (legacy method)
21102122
if (model.proj_type == PROJECTOR_TYPE_QWEN25O) {
21112123
model.proj_type = modality == CLIP_MODALITY_VISION
21122124
? PROJECTOR_TYPE_QWEN25VL

0 commit comments

Comments
 (0)