@@ -465,6 +465,9 @@ static int last_generation_settings_speaker_seed;
465465static int last_generation_settings_audio_seed;
466466static std::vector<llama_token> last_speaker_codes; // will store cached speaker
467467static int last_speaker_seed = -999 ; // seed associated with last_speaker_codes; assigned from speaker_seed after the codes are trimmed (-999 presumably means "nothing cached yet" - confirm)
468+ static int cts_offset = 151672 ; // fallback id of the <|0|> token; replaced at model load when <|0|> tokenizes to exactly one token. Lower bound of the kept token range and the value subtracted from each kept token
469+ static int space_id = 151670 ; // overwritten at load in v0.3 mode with the single token from the version probe; pushed onto the trimmed speaker codes for v3 (presumably <|space|> - confirm). NOTE(review): default equals code_terminate_id's default
470+ static int code_terminate_id = 151670 ; // token searched for (from the back) when trimming cached speaker codes; replaced in v0.3 mode by <|audio_end|> when that tokenizes to exactly one token
468471
469472bool ttstype_load_model (const tts_load_model_inputs inputs)
470473{
@@ -552,11 +555,23 @@ bool ttstype_load_model(const tts_load_model_inputs inputs)
552555 if (testoks.size () == 1 ) {
553556 ttsver = TTS_VER_3;
554557 printf (" \n Using v0.3 mode" );
558+ // note that the final word does NOT have a space at the end.
559+ space_id = testoks[0 ];
560+ testoks = common_tokenize (ttcvocab," <|audio_end|>" ,false ,true );
561+ if (testoks.size () == 1 ) {
562+ code_terminate_id = testoks[0 ];
563+ }
555564 } else {
556565 ttsver = TTS_VER_2;
557566 printf (" \n Using v0.2 mode" );
558567 }
559568
569+ // determine offset of <|0|>
570+ testoks = common_tokenize (ttcvocab," <|0|>" ,false ,true );
571+ if (testoks.size () == 1 ) {
572+ cts_offset = testoks[0 ];
573+ }
574+
560575 printf (" \n TTS Load Complete.\n " );
561576 return true ;
562577}
@@ -711,16 +726,17 @@ tts_generation_outputs ttstype_generate(const tts_generation_inputs inputs)
711726 {
712727 printf (" \n Guide Tokens (%d tokens):\n " , guide_tokens.size ());
713728 const std::string inp_txt = common_detokenize (ttc_ctx, guide_tokens, true );
714- printf (" %s" , inp_txt.c_str ());
729+ printf (" %s, " , inp_txt.c_str ());
715730 printf (" \n " );
716731 }
717732 prompt_add (prompt_inp, ttcvocab, sampletext, false , true );
718733 prompt_add (prompt_inp, ttcvocab, " <|text_end|>\n <|audio_start|>\n " , false , true );
719734 if (!inputs.quiet && ttsdebugmode==1 )
720735 {
721- printf (" \n Prepare new speaker (%d input tokens)..." , prompt_inp.size ());
736+ printf (" \n Prepare new speaker (%d input tokens)...\n " , prompt_inp.size ());
737+ print_tok_vec (prompt_inp);
722738 }
723- kcpp_embd_batch tts_batch = kcpp_embd_batch (prompt_inp, 0 , false , true );
739+ kcpp_embd_batch tts_batch = kcpp_embd_batch (prompt_inp, 0 , false , false );
724740 auto evalok = (llama_decode (ttc_ctx, tts_batch.batch )==0 );
725741 if (!evalok) {
726742 printf (" \n Error: TTS prompt batch processing failed\n " );
@@ -773,11 +789,17 @@ tts_generation_outputs ttstype_generate(const tts_generation_inputs inputs)
773789 }
774790 }
775791
776- // trim everything after final <|code_end|>
777- auto it = std::find (last_speaker_codes.rbegin (), last_speaker_codes.rend (), 151670 );
792+ // trim everything after the final terminator token (<|code_end|> for v2, <|audio_end|> for v3); for v3, the terminator and the token before it are then replaced with a single <|space|>
793+ auto it = std::find (last_speaker_codes.rbegin (), last_speaker_codes.rend (), code_terminate_id );
778794 if (it != last_speaker_codes.rend ()) {
779- // Erase elements after the found 999 (inclusive)
795+ // Erase every element after the found token (the found token itself is kept)
780796 last_speaker_codes.erase (it.base (), last_speaker_codes.end ());
797+ if (ttsver==TTS_VER_3 && last_speaker_codes.size ()>2 )
798+ {
799+ last_speaker_codes.pop_back ();
800+ last_speaker_codes.pop_back ();
801+ last_speaker_codes.push_back (space_id);
802+ }
781803 }
782804 last_speaker_seed = speaker_seed;
783805 if (!inputs.quiet && ttsdebugmode==1 )
@@ -817,18 +839,20 @@ tts_generation_outputs ttstype_generate(const tts_generation_inputs inputs)
817839
818840 if (!last_speaker_codes.empty () && speaker_seed > 0 ) // apply speaker voice output
819841 {
820- prompt_add (prompt_inp, last_speaker_codes);
842+ prompt_add (prompt_inp, last_speaker_codes);
843+ prompt_add (prompt_inp, ttcvocab, " \n " , false , true );
821844 }
822845
823846 if (!inputs.quiet && ttsdebugmode==1 )
824847 {
825848 printf (" \n DUMP TTS PROMPT (%d tokens):\n " , prompt_inp.size ());
849+ print_tok_vec (prompt_inp);
826850 const std::string inp_txt = common_detokenize (ttc_ctx, prompt_inp, true );
827851 printf (" \n %s\n " , inp_txt.c_str ());
828852 }
829853
830854 // create batch with tokens for decoding prompt processing
831- kcpp_embd_batch tts_batch = kcpp_embd_batch (prompt_inp, 0 , false , true );
855+ kcpp_embd_batch tts_batch = kcpp_embd_batch (prompt_inp, 0 , false , false );
832856
833857 auto evalok = (llama_decode (ttc_ctx, tts_batch.batch )==0 );
834858 if (!evalok) {
@@ -897,10 +921,10 @@ tts_generation_outputs ttstype_generate(const tts_generation_inputs inputs)
897921 }
898922
899923 // remove all non-audio tokens (i.e. < cts_offset || > cts_offset + 4100; 151672..155772 with the default offset)
900- codes.erase (std::remove_if (codes.begin (), codes.end (), [](llama_token t) { return t < 151672 || t > 155772 ; }), codes.end ());
924+ codes.erase (std::remove_if (codes.begin (), codes.end (), [](llama_token t) { return t < cts_offset || t > (cts_offset+ 4100 ) ; }), codes.end ());
901925
902926 for (auto & token : codes) {
903- token -= 151672 ;
927+ token -= cts_offset ;
904928 }
905929
906930 const int n_codes = codes.size ();
@@ -939,7 +963,7 @@ tts_generation_outputs ttstype_generate(const tts_generation_inputs inputs)
939963 audio[i] = 0 .0f ;
940964 }
941965 // add some silence at the end
942- for (int i = 0 ; i < t_sr/ 10 ; ++i) {
966+ for (int i = 0 ; i < cutout ; ++i) {
943967 audio.push_back (0 .0f );
944968 }
945969
0 commit comments