Added Voxtral support; however, without the magic token it hears audio as text.

LostRuins · LostRuins · commit 12a6088a6540 · 2025-07-28T22:35:59.000+08:00
diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
@@ -2440,19 +2440,30 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
                 fprintf(stderr, "%s: error: failed to load mmproj model!\n", __func__);
                 return ModelLoadResult::FAIL;
             }
-            const int n_embd_clip = clip_n_mmproj_embd(clp_ctx_v);
             const int n_embd_llm  = llama_n_embd(llamamodel);
-            if (clp_ctx_v && clp_ctx_a) {
-                int n_embd_a = clip_n_mmproj_embd(clp_ctx_a);
-                if (n_embd_clip != n_embd_a) {
-                    fprintf(stderr, "%s: mmproj embedding mismatch between Audio and Vision (%d and %d)! Make sure you use the correct mmproj file!\n", __func__,n_embd_clip, n_embd_a);
+            int n_embd_clip_a = -1;
+            int n_embd_clip_v = -1;
+            if (clp_ctx_v)
+            {
+                n_embd_clip_v = clip_n_mmproj_embd(clp_ctx_v);
+                if (n_embd_clip_v != n_embd_llm) {
+                    fprintf(stderr, "%s: mmproj vision embedding mismatch (%d and %d)! Make sure you use the correct mmproj file!\n", __func__,n_embd_clip_v, n_embd_llm);
+                    return ModelLoadResult::FAIL;
+                }
+            }
+            if (clp_ctx_a)
+            {
+                n_embd_clip_a = clip_n_mmproj_embd(clp_ctx_a);
+                if (n_embd_clip_a != n_embd_llm) {
+                    fprintf(stderr, "%s: mmproj audio embedding mismatch (%d and %d)! Make sure you use the correct mmproj file!\n", __func__,n_embd_clip_a, n_embd_llm);
                     return ModelLoadResult::FAIL;
                 }
             }
-            if (n_embd_clip != n_embd_llm) {
-                fprintf(stderr, "%s: mmproj embedding mismatch (%d and %d)! Make sure you use the correct mmproj file!\n", __func__,n_embd_clip, n_embd_llm);
+            if (clp_ctx_v && clp_ctx_a && n_embd_clip_v != n_embd_clip_a) {
+                fprintf(stderr, "%s: mmproj embedding mismatch between Audio and Vision (%d and %d)! Make sure you use the correct mmproj file!\n", __func__,n_embd_clip_v, n_embd_clip_a);
                 return ModelLoadResult::FAIL;
             }
+
             if(clp_ctx_a) //init audio
             {
                 if (clip_has_whisper_encoder(clp_ctx_a)) {
@@ -2473,9 +2484,9 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
             {
                 printf("Error: Speculative decoding cannot be used with Recurrent models!\n");
             }
-            else if(clp_ctx_v!=nullptr)
+            else if(clp_ctx_v!=nullptr || clp_ctx_a!=nullptr)
             {
-                printf("Error: Speculative decoding cannot be used with multimodal vision projectors!\n");
+                printf("Error: Speculative decoding cannot be used with multimodal projectors!\n");
             }
             else
             {
@@ -3115,6 +3126,8 @@ static void PrepareMediaEmbds(const int nctx, const std::vector<int> & media_int
                     printf("\nWarning: Audio Embd excluded - Context size too low or not enough clip tokens! (needed %d)\nAudio will be IGNORED! You probably want to relaunch with a larger context size!\n",cliptokensneeded);
                 }
 
+            }else{
+                printf("\nUnhandled media object, something went wrong.\n");
             }
         }
     }

Original file line number	Diff line number	Diff line change
`@@ -2440,19 +2440,30 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in`
`2440`	`2440`	`fprintf(stderr, "%s: error: failed to load mmproj model!\n", __func__);`
`2441`	`2441`	`return ModelLoadResult::FAIL;`
`2442`	`2442`	`}`
`2443`		`- const int n_embd_clip = clip_n_mmproj_embd(clp_ctx_v);`
`2444`	`2443`	`const int n_embd_llm = llama_n_embd(llamamodel);`
`2445`		`- if (clp_ctx_v && clp_ctx_a) {`
`2446`		`- int n_embd_a = clip_n_mmproj_embd(clp_ctx_a);`
`2447`		`- if (n_embd_clip != n_embd_a) {`
`2448`		`- fprintf(stderr, "%s: mmproj embedding mismatch between Audio and Vision (%d and %d)! Make sure you use the correct mmproj file!\n", __func__,n_embd_clip, n_embd_a);`
	`2444`	`+ int n_embd_clip_a = -1;`
	`2445`	`+ int n_embd_clip_v = -1;`
	`2446`	`+ if (clp_ctx_v)`
	`2447`	`+ {`
	`2448`	`+ n_embd_clip_v = clip_n_mmproj_embd(clp_ctx_v);`
	`2449`	`+ if (n_embd_clip_v != n_embd_llm) {`
	`2450`	`+ fprintf(stderr, "%s: mmproj vision embedding mismatch (%d and %d)! Make sure you use the correct mmproj file!\n", __func__,n_embd_clip_v, n_embd_llm);`
	`2451`	`+ return ModelLoadResult::FAIL;`
	`2452`	`+ }`
	`2453`	`+ }`
	`2454`	`+ if (clp_ctx_a)`
	`2455`	`+ {`
	`2456`	`+ n_embd_clip_a = clip_n_mmproj_embd(clp_ctx_a);`
	`2457`	`+ if (n_embd_clip_a != n_embd_llm) {`
	`2458`	`+ fprintf(stderr, "%s: mmproj audio embedding mismatch (%d and %d)! Make sure you use the correct mmproj file!\n", __func__,n_embd_clip_a, n_embd_llm);`
`2449`	`2459`	`return ModelLoadResult::FAIL;`
`2450`	`2460`	`}`
`2451`	`2461`	`}`
`2452`		`- if (n_embd_clip != n_embd_llm) {`
`2453`		`- fprintf(stderr, "%s: mmproj embedding mismatch (%d and %d)! Make sure you use the correct mmproj file!\n", __func__,n_embd_clip, n_embd_llm);`
	`2462`	`+ if (clp_ctx_v && clp_ctx_a && n_embd_clip_v != n_embd_clip_a) {`
	`2463`	`+ fprintf(stderr, "%s: mmproj embedding mismatch between Audio and Vision (%d and %d)! Make sure you use the correct mmproj file!\n", __func__,n_embd_clip_v, n_embd_clip_a);`
`2454`	`2464`	`return ModelLoadResult::FAIL;`
`2455`	`2465`	`}`
	`2466`	`+`
`2456`	`2467`	`if(clp_ctx_a) //init audio`
`2457`	`2468`	`{`
`2458`	`2469`	`if (clip_has_whisper_encoder(clp_ctx_a)) {`
`@@ -2473,9 +2484,9 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in`
`2473`	`2484`	`{`
`2474`	`2485`	`printf("Error: Speculative decoding cannot be used with Recurrent models!\n");`
`2475`	`2486`	`}`
`2476`		`- else if(clp_ctx_v!=nullptr)`
	`2487`	`+ else if(clp_ctx_v!=nullptr \|\| clp_ctx_a!=nullptr)`
`2477`	`2488`	`{`
`2478`		`- printf("Error: Speculative decoding cannot be used with multimodal vision projectors!\n");`
	`2489`	`+ printf("Error: Speculative decoding cannot be used with multimodal projectors!\n");`
`2479`	`2490`	`}`
`2480`	`2491`	`else`
`2481`	`2492`	`{`
`@@ -3115,6 +3126,8 @@ static void PrepareMediaEmbds(const int nctx, const std::vector<int> & media_int`
`3115`	`3126`	`printf("\nWarning: Audio Embd excluded - Context size too low or not enough clip tokens! (needed %d)\nAudio will be IGNORED! You probably want to relaunch with a larger context size!\n",cliptokensneeded);`
`3116`	`3127`	`}`
`3117`	`3128`
	`3129`	`+ }else{`
	`3130`	`+ printf("\nUnhandled media object, something went wrong.\n");`
`3118`	`3131`	`}`
`3119`	`3132`	`}`
`3120`	`3133`	`}`