File tree: 2 files changed, +17 -3 lines changed
Original file line number | Diff line number | Diff line change
3030#define KEY_LAYER_NORM_EPS " clip.%s.attention.layer_norm_epsilon"
3131
3232// vision-specific
33+ #define KEY_VISION_PROJ_TYPE " clip.vision.projector_type" // for models with mixed modalities
3334#define KEY_IMAGE_SIZE " clip.vision.image_size"
3435#define KEY_PATCH_SIZE " clip.vision.patch_size"
3536#define KEY_IMAGE_MEAN " clip.vision.image_mean"
4647#define KEY_MINICPMV_VERSION " clip.minicpmv_version"
4748
4849// audio-specific
50+ #define KEY_AUDIO_PROJ_TYPE " clip.audio.projector_type" // for models with mixed modalities
4951#define KEY_A_NUM_MEL_BINS " clip.audio.num_mel_bins"
5052#define KEY_A_PROJ_STACK_FACTOR " clip.audio.projector.stack_factor"
5153
Original file line number | Diff line number | Diff line change
@@ -2098,15 +2098,27 @@ struct clip_model_loader {
20982098 // projector type
20992099 std::string proj_type;
21002100 {
2101+ // default key
21012102 get_string (KEY_PROJ_TYPE, proj_type, false );
2102- if (!proj_type.empty ()) {
2103- model.proj_type = clip_projector_type_from_string (proj_type);
2103+
2104+ // for models with mixed modalities
2105+ if (proj_type.empty ()) {
2106+ if (modality == CLIP_MODALITY_VISION) {
2107+ get_string (KEY_VISION_PROJ_TYPE, proj_type, false );
2108+ } else if (modality == CLIP_MODALITY_AUDIO) {
2109+ get_string (KEY_AUDIO_PROJ_TYPE, proj_type, false );
2110+ } else {
2111+ GGML_ABORT (" unknown modality" );
2112+ }
21042113 }
2114+
2115+ model.proj_type = clip_projector_type_from_string (proj_type);
2116+
21052117 if (model.proj_type == PROJECTOR_TYPE_UNKNOWN) {
21062118 throw std::runtime_error (string_format (" %s: unknown projector type: %s\n " , __func__, proj_type.c_str ()));
21072119 }
21082120
2109- // correct arch for multimodal models
2121+ // correct arch for multimodal models (legacy method)
21102122 if (model.proj_type == PROJECTOR_TYPE_QWEN25O) {
21112123 model.proj_type = modality == CLIP_MODALITY_VISION
21122124 ? PROJECTOR_TYPE_QWEN25VL
You can’t perform that action at this time.
0 commit comments