Skip to content

Commit 12a6088

Browse files
committed
Added Voxtral support; however, without the magic token it hears audio as text.
1 parent b8425f5 commit 12a6088

File tree

1 file changed

+22
-9
lines changed

1 file changed

+22
-9
lines changed

gpttype_adapter.cpp

Lines changed: 22 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2440,19 +2440,30 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
24402440
fprintf(stderr, "%s: error: failed to load mmproj model!\n", __func__);
24412441
return ModelLoadResult::FAIL;
24422442
}
2443-
const int n_embd_clip = clip_n_mmproj_embd(clp_ctx_v);
24442443
const int n_embd_llm = llama_n_embd(llamamodel);
2445-
if (clp_ctx_v && clp_ctx_a) {
2446-
int n_embd_a = clip_n_mmproj_embd(clp_ctx_a);
2447-
if (n_embd_clip != n_embd_a) {
2448-
fprintf(stderr, "%s: mmproj embedding mismatch between Audio and Vision (%d and %d)! Make sure you use the correct mmproj file!\n", __func__,n_embd_clip, n_embd_a);
2444+
int n_embd_clip_a = -1;
2445+
int n_embd_clip_v = -1;
2446+
if (clp_ctx_v)
2447+
{
2448+
n_embd_clip_v = clip_n_mmproj_embd(clp_ctx_v);
2449+
if (n_embd_clip_v != n_embd_llm) {
2450+
fprintf(stderr, "%s: mmproj vision embedding mismatch (%d and %d)! Make sure you use the correct mmproj file!\n", __func__,n_embd_clip_v, n_embd_llm);
2451+
return ModelLoadResult::FAIL;
2452+
}
2453+
}
2454+
if (clp_ctx_a)
2455+
{
2456+
n_embd_clip_a = clip_n_mmproj_embd(clp_ctx_a);
2457+
if (n_embd_clip_a != n_embd_llm) {
2458+
fprintf(stderr, "%s: mmproj audio embedding mismatch (%d and %d)! Make sure you use the correct mmproj file!\n", __func__,n_embd_clip_a, n_embd_llm);
24492459
return ModelLoadResult::FAIL;
24502460
}
24512461
}
2452-
if (n_embd_clip != n_embd_llm) {
2453-
fprintf(stderr, "%s: mmproj embedding mismatch (%d and %d)! Make sure you use the correct mmproj file!\n", __func__,n_embd_clip, n_embd_llm);
2462+
if (clp_ctx_v && clp_ctx_a && n_embd_clip_v != n_embd_clip_a) {
2463+
fprintf(stderr, "%s: mmproj embedding mismatch between Audio and Vision (%d and %d)! Make sure you use the correct mmproj file!\n", __func__,n_embd_clip_v, n_embd_clip_a);
24542464
return ModelLoadResult::FAIL;
24552465
}
2466+
24562467
if(clp_ctx_a) //init audio
24572468
{
24582469
if (clip_has_whisper_encoder(clp_ctx_a)) {
@@ -2473,9 +2484,9 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
24732484
{
24742485
printf("Error: Speculative decoding cannot be used with Recurrent models!\n");
24752486
}
2476-
else if(clp_ctx_v!=nullptr)
2487+
else if(clp_ctx_v!=nullptr || clp_ctx_a!=nullptr)
24772488
{
2478-
printf("Error: Speculative decoding cannot be used with multimodal vision projectors!\n");
2489+
printf("Error: Speculative decoding cannot be used with multimodal projectors!\n");
24792490
}
24802491
else
24812492
{
@@ -3115,6 +3126,8 @@ static void PrepareMediaEmbds(const int nctx, const std::vector<int> & media_int
31153126
printf("\nWarning: Audio Embd excluded - Context size too low or not enough clip tokens! (needed %d)\nAudio will be IGNORED! You probably want to relaunch with a larger context size!\n",cliptokensneeded);
31163127
}
31173128

3129+
}else{
3130+
printf("\nUnhandled media object, something went wrong.\n");
31183131
}
31193132
}
31203133
}

0 commit comments

Comments
 (0)