Skip to content

Commit 8d961bb

Browse files
committed
all outetts 0.3 models working
1 parent 828a01d commit 8d961bb

File tree

1 file changed

+35
-11
lines changed

1 file changed

+35
-11
lines changed

otherarch/tts_adapter.cpp

Lines changed: 35 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -465,6 +465,9 @@ static int last_generation_settings_speaker_seed;
465465
static int last_generation_settings_audio_seed;
466466
static std::vector<llama_token> last_speaker_codes; //will store cached speaker
467467
static int last_speaker_seed = -999;
468+
static int cts_offset = 151672;
469+
static int space_id = 151670;
470+
static int code_terminate_id = 151670;
468471

469472
bool ttstype_load_model(const tts_load_model_inputs inputs)
470473
{
@@ -552,11 +555,23 @@ bool ttstype_load_model(const tts_load_model_inputs inputs)
552555
if (testoks.size() == 1) {
553556
ttsver = TTS_VER_3;
554557
printf("\nUsing v0.3 mode");
558+
//note that the final word does NOT have a space at the end.
559+
space_id = testoks[0];
560+
testoks = common_tokenize(ttcvocab,"<|audio_end|>",false,true);
561+
if (testoks.size() == 1) {
562+
code_terminate_id = testoks[0];
563+
}
555564
} else {
556565
ttsver = TTS_VER_2;
557566
printf("\nUsing v0.2 mode");
558567
}
559568

569+
//determine offset of <|0|>
570+
testoks = common_tokenize(ttcvocab,"<|0|>",false,true);
571+
if (testoks.size() == 1) {
572+
cts_offset = testoks[0];
573+
}
574+
560575
printf("\nTTS Load Complete.\n");
561576
return true;
562577
}
@@ -711,16 +726,17 @@ tts_generation_outputs ttstype_generate(const tts_generation_inputs inputs)
711726
{
712727
printf("\nGuide Tokens (%d tokens):\n", guide_tokens.size());
713728
const std::string inp_txt = common_detokenize(ttc_ctx, guide_tokens, true);
714-
printf("%s", inp_txt.c_str());
729+
printf("%s,", inp_txt.c_str());
715730
printf("\n");
716731
}
717732
prompt_add(prompt_inp, ttcvocab, sampletext, false, true);
718733
prompt_add(prompt_inp, ttcvocab, "<|text_end|>\n<|audio_start|>\n", false, true);
719734
if(!inputs.quiet && ttsdebugmode==1)
720735
{
721-
printf("\nPrepare new speaker (%d input tokens)...", prompt_inp.size());
736+
printf("\nPrepare new speaker (%d input tokens)...\n", prompt_inp.size());
737+
print_tok_vec(prompt_inp);
722738
}
723-
kcpp_embd_batch tts_batch = kcpp_embd_batch(prompt_inp, 0, false, true);
739+
kcpp_embd_batch tts_batch = kcpp_embd_batch(prompt_inp, 0, false, false);
724740
auto evalok = (llama_decode(ttc_ctx, tts_batch.batch)==0);
725741
if (!evalok) {
726742
printf("\nError: TTS prompt batch processing failed\n");
@@ -773,11 +789,17 @@ tts_generation_outputs ttstype_generate(const tts_generation_inputs inputs)
773789
}
774790
}
775791

776-
//trim everything after final <|code_end|>
777-
auto it = std::find(last_speaker_codes.rbegin(), last_speaker_codes.rend(), 151670);
792+
//trim everything after the final terminator token (<|code_end|> for v2, <|audio_end|> for v3); for v3 the last two remaining tokens are then replaced with a single space_id token
793+
auto it = std::find(last_speaker_codes.rbegin(), last_speaker_codes.rend(), code_terminate_id);
778794
if (it != last_speaker_codes.rend()) {
779-
// Erase elements after the found 999 (inclusive)
795+
// Erase everything after the found token (it.base() points one past it, so the found token itself is kept)
780796
last_speaker_codes.erase(it.base(), last_speaker_codes.end());
797+
if(ttsver==TTS_VER_3 && last_speaker_codes.size()>2)
798+
{
799+
last_speaker_codes.pop_back();
800+
last_speaker_codes.pop_back();
801+
last_speaker_codes.push_back(space_id);
802+
}
781803
}
782804
last_speaker_seed = speaker_seed;
783805
if(!inputs.quiet && ttsdebugmode==1)
@@ -817,18 +839,20 @@ tts_generation_outputs ttstype_generate(const tts_generation_inputs inputs)
817839

818840
if(!last_speaker_codes.empty() && speaker_seed > 0) //apply speaker voice output
819841
{
820-
prompt_add(prompt_inp, last_speaker_codes);
842+
prompt_add(prompt_inp, last_speaker_codes);
843+
prompt_add(prompt_inp, ttcvocab, "\n", false, true);
821844
}
822845

823846
if(!inputs.quiet && ttsdebugmode==1)
824847
{
825848
printf("\nDUMP TTS PROMPT (%d tokens):\n", prompt_inp.size());
849+
print_tok_vec(prompt_inp);
826850
const std::string inp_txt = common_detokenize(ttc_ctx, prompt_inp, true);
827851
printf("\n%s\n", inp_txt.c_str());
828852
}
829853

830854
//create batch with tokens for decoding prompt processing
831-
kcpp_embd_batch tts_batch = kcpp_embd_batch(prompt_inp, 0, false, true);
855+
kcpp_embd_batch tts_batch = kcpp_embd_batch(prompt_inp, 0, false, false);
832856

833857
auto evalok = (llama_decode(ttc_ctx, tts_batch.batch)==0);
834858
if (!evalok) {
@@ -897,10 +921,10 @@ tts_generation_outputs ttstype_generate(const tts_generation_inputs inputs)
897921
}
898922

899923
// remove all non-audio tokens (i.e. < cts_offset || > cts_offset + 4100; with the default offset that is < 151672 || > 155772)
900-
codes.erase(std::remove_if(codes.begin(), codes.end(), [](llama_token t) { return t < 151672 || t > 155772; }), codes.end());
924+
codes.erase(std::remove_if(codes.begin(), codes.end(), [](llama_token t) { return t < cts_offset || t > (cts_offset+4100); }), codes.end());
901925

902926
for (auto & token : codes) {
903-
token -= 151672;
927+
token -= cts_offset;
904928
}
905929

906930
const int n_codes = codes.size();
@@ -939,7 +963,7 @@ tts_generation_outputs ttstype_generate(const tts_generation_inputs inputs)
939963
audio[i] = 0.0f;
940964
}
941965
//add some silence at the end
942-
for (int i = 0; i < t_sr/10; ++i) {
966+
for (int i = 0; i < cutout; ++i) {
943967
audio.push_back(0.0f);
944968
}
945969

0 commit comments

Comments
 (0)