|
26 | 26 | #endif |
27 | 27 |
|
28 | 28 | //imports required for tts.cpp to work |
29 | | -#include "tts.cpp" |
| 29 | +#include "ttscommon.h" |
| 30 | +#include "ttscpp.cpp" |
30 | 31 | #include "ttstokenizer.cpp" |
31 | 32 | #include "ttssampler.cpp" |
32 | 33 | #include "parler_model.cpp" |
@@ -497,6 +498,10 @@ static int code_terminate_id = 151670; |
497 | 498 | static int nthreads = 4; |
498 | 499 | static int tts_max_len = 4096; |
499 | 500 |
|
| 501 | +//ttscpp specific |
| 502 | +static generation_configuration * ttscpp_config = nullptr; |
| 503 | +static struct tts_runner * ttscpp_runner = nullptr; |
| 504 | + |
500 | 505 | int total_tts_gens = 0; |
501 | 506 |
|
502 | 507 | bool ttstype_load_model(const tts_load_model_inputs inputs) |
@@ -532,81 +537,103 @@ bool ttstype_load_model(const tts_load_model_inputs inputs) |
532 | 537 |
|
533 | 538 | std::string modelfile_ttc = inputs.ttc_model_filename; |
534 | 539 | std::string modelfile_cts = inputs.cts_model_filename; |
535 | | - printf("\nLoading TTS Model, OuteTTS: %s \nWavTokenizer: %s \n",modelfile_ttc.c_str(),modelfile_cts.c_str()); |
| 540 | + std::string detectedarch = gguf_get_model_arch(modelfile_ttc); |
| 541 | + |
| 542 | + bool is_ttscpp_file = false; |
| 543 | + if (detectedarch!="" && SUPPORTED_ARCHITECTURES.find(detectedarch) != SUPPORTED_ARCHITECTURES.end()) { |
| 544 | + is_ttscpp_file = true; |
| 545 | + printf("\nLoading TTS.CPP Model Arch: %s \n", detectedarch.c_str()); |
| 546 | + }else{ |
| 547 | + printf("\nLoading OuteTTS Model, OuteTTS: %s \nWavTokenizer: %s \n",modelfile_ttc.c_str(),modelfile_cts.c_str()); |
| 548 | + if(modelfile_ttc=="" || modelfile_cts=="") |
| 549 | + { |
| 550 | + printf("\nWarning: KCPP OuteTTS missing a file! Make sure both TTS and WavTokenizer models are loaded.\n"); |
| 551 | + return false; |
| 552 | + } |
| 553 | + } |
536 | 554 |
|
537 | 555 | ttsdebugmode = inputs.debugmode; |
538 | 556 |
|
539 | 557 | // tts init |
540 | | - llama_model_params tts_model_params = llama_model_default_params(); |
541 | | - llama_context_params tts_ctx_params = llama_context_default_params(); |
542 | | - |
543 | | - nthreads = inputs.threads; |
544 | | - |
545 | | - tts_max_len = inputs.ttsmaxlen; |
546 | | - |
547 | | - tts_model_params.use_mmap = false; |
548 | | - tts_model_params.use_mlock = false; |
549 | | - tts_model_params.n_gpu_layers = inputs.gpulayers; //offload if possible |
550 | | - tts_model_params.split_mode = llama_split_mode::LLAMA_SPLIT_MODE_LAYER; |
551 | | - int kcpp_parseinfo_maindevice = inputs.kcpp_main_gpu<=0?0:inputs.kcpp_main_gpu; |
552 | | - tts_model_params.main_gpu = kcpp_parseinfo_maindevice; |
553 | | - tts_ctx_params.n_ctx = 8192; |
554 | | - tts_ctx_params.offload_kqv = true; |
555 | | - tts_ctx_params.n_batch = 8192; |
556 | | - tts_ctx_params.n_ubatch = 512; |
557 | | - tts_ctx_params.n_threads = nthreads; |
558 | | - tts_ctx_params.n_threads_batch = nthreads; |
559 | | - tts_ctx_params.flash_attn = inputs.flash_attention; |
560 | | - tts_ctx_params.kv_unified = true; |
561 | | - |
562 | | - llama_model * ttcmodel = llama_model_load_from_file(modelfile_ttc.c_str(), tts_model_params); |
563 | | - ttc_ctx = llama_init_from_model(ttcmodel, tts_ctx_params); |
564 | | - |
565 | | - if (ttc_ctx == nullptr) { |
566 | | - printf("\nTTS Load Error: Failed to initialize ttc context!\n"); |
567 | | - return false; |
568 | | - } |
| 558 | + if (is_ttscpp_file) { |
| 559 | + ttscpp_config = new generation_configuration("af_alloy", 50, 1.0, 1.0, true, "", 0, 1.0); |
| 560 | + ttscpp_runner = runner_from_file(modelfile_ttc, inputs.threads, ttscpp_config, true); |
| 561 | + if (ttscpp_runner == nullptr) { |
| 562 | + printf("\nTTS Load Error: Failed to initialize TTSCPP!\n"); |
| 563 | + return false; |
| 564 | + } |
| 565 | + } else { //outetts only |
| 566 | + llama_model_params tts_model_params = llama_model_default_params(); |
| 567 | + llama_context_params tts_ctx_params = llama_context_default_params(); |
| 568 | + |
| 569 | + nthreads = inputs.threads; |
| 570 | + |
| 571 | + tts_max_len = inputs.ttsmaxlen; |
| 572 | + |
| 573 | + tts_model_params.use_mmap = false; |
| 574 | + tts_model_params.use_mlock = false; |
| 575 | + tts_model_params.n_gpu_layers = inputs.gpulayers; //offload if possible |
| 576 | + tts_model_params.split_mode = llama_split_mode::LLAMA_SPLIT_MODE_LAYER; |
| 577 | + int kcpp_parseinfo_maindevice = inputs.kcpp_main_gpu<=0?0:inputs.kcpp_main_gpu; |
| 578 | + tts_model_params.main_gpu = kcpp_parseinfo_maindevice; |
| 579 | + tts_ctx_params.n_ctx = 8192; |
| 580 | + tts_ctx_params.offload_kqv = true; |
| 581 | + tts_ctx_params.n_batch = 8192; |
| 582 | + tts_ctx_params.n_ubatch = 512; |
| 583 | + tts_ctx_params.n_threads = nthreads; |
| 584 | + tts_ctx_params.n_threads_batch = nthreads; |
| 585 | + tts_ctx_params.flash_attn = inputs.flash_attention; |
| 586 | + tts_ctx_params.kv_unified = true; |
| 587 | + |
| 588 | + llama_model * ttcmodel = llama_model_load_from_file(modelfile_ttc.c_str(), tts_model_params); |
| 589 | + ttc_ctx = llama_init_from_model(ttcmodel, tts_ctx_params); |
| 590 | + |
| 591 | + if (ttc_ctx == nullptr) { |
| 592 | + printf("\nTTS Load Error: Failed to initialize ttc context!\n"); |
| 593 | + return false; |
| 594 | + } |
569 | 595 |
|
570 | | - llama_model * ctsmodel = llama_model_load_from_file(modelfile_cts.c_str(), tts_model_params); |
| 596 | + llama_model * ctsmodel = llama_model_load_from_file(modelfile_cts.c_str(), tts_model_params); |
571 | 597 |
|
572 | | - tts_ctx_params.embeddings = true; //this requires embeddings instead |
573 | | - tts_ctx_params.n_ubatch = tts_ctx_params.n_batch; |
574 | | - cts_ctx = llama_init_from_model(ctsmodel, tts_ctx_params); |
| 598 | + tts_ctx_params.embeddings = true; //this requires embeddings instead |
| 599 | + tts_ctx_params.n_ubatch = tts_ctx_params.n_batch; |
| 600 | + cts_ctx = llama_init_from_model(ctsmodel, tts_ctx_params); |
575 | 601 |
|
576 | | - if (cts_ctx == nullptr) { |
577 | | - printf("\nTTS Load Error: Failed to initialize cts context!\n"); |
578 | | - return false; |
579 | | - } |
| 602 | + if (cts_ctx == nullptr) { |
| 603 | + printf("\nTTS Load Error: Failed to initialize cts context!\n"); |
| 604 | + return false; |
| 605 | + } |
580 | 606 |
|
581 | | - std::vector<int> tmp = {1, 2, 3, 4}; |
582 | | - llama_memory_clear(llama_get_memory(ttc_ctx),true); |
583 | | - auto er = llama_decode(ttc_ctx, llama_batch_get_one(tmp.data(), tmp.size())); |
584 | | - if(er!=0) |
585 | | - { |
586 | | - printf("\nTTS Eval returned nonzero: %d\n",er); |
587 | | - return false; |
588 | | - } |
| 607 | + std::vector<int> tmp = {1, 2, 3, 4}; |
| 608 | + llama_memory_clear(llama_get_memory(ttc_ctx),true); |
| 609 | + auto er = llama_decode(ttc_ctx, llama_batch_get_one(tmp.data(), tmp.size())); |
| 610 | + if(er!=0) |
| 611 | + { |
| 612 | + printf("\nTTS Eval returned nonzero: %d\n",er); |
| 613 | + return false; |
| 614 | + } |
589 | 615 |
|
590 | | - const llama_vocab * ttcvocab = llama_model_get_vocab(ttcmodel); |
591 | | - llama_tokens testoks = common_tokenize(ttcvocab,"<|space|>",false,true); |
592 | | - if (testoks.size() == 1) { |
593 | | - ttsver = TTS_VER_3; |
594 | | - printf("\nUsing v0.3 mode"); |
595 | | - //note that the final word does NOT have a space at the end. |
596 | | - space_id = testoks[0]; |
597 | | - testoks = common_tokenize(ttcvocab,"<|audio_end|>",false,true); |
| 616 | + const llama_vocab * ttcvocab = llama_model_get_vocab(ttcmodel); |
| 617 | + llama_tokens testoks = common_tokenize(ttcvocab,"<|space|>",false,true); |
598 | 618 | if (testoks.size() == 1) { |
599 | | - code_terminate_id = testoks[0]; |
| 619 | + ttsver = TTS_VER_3; |
| 620 | + printf("\nUsing v0.3 mode"); |
| 621 | + //note that the final word does NOT have a space at the end. |
| 622 | + space_id = testoks[0]; |
| 623 | + testoks = common_tokenize(ttcvocab,"<|audio_end|>",false,true); |
| 624 | + if (testoks.size() == 1) { |
| 625 | + code_terminate_id = testoks[0]; |
| 626 | + } |
| 627 | + } else { |
| 628 | + ttsver = TTS_VER_2; |
| 629 | + printf("\nUsing v0.2 mode"); |
600 | 630 | } |
601 | | - } else { |
602 | | - ttsver = TTS_VER_2; |
603 | | - printf("\nUsing v0.2 mode"); |
604 | | - } |
605 | 631 |
|
606 | | - //determine offset of <|0|> |
607 | | - testoks = common_tokenize(ttcvocab,"<|0|>",false,true); |
608 | | - if (testoks.size() == 1) { |
609 | | - cts_offset = testoks[0]; |
| 632 | + //determine offset of <|0|> |
| 633 | + testoks = common_tokenize(ttcvocab,"<|0|>",false,true); |
| 634 | + if (testoks.size() == 1) { |
| 635 | + cts_offset = testoks[0]; |
| 636 | + } |
610 | 637 | } |
611 | 638 |
|
612 | 639 | printf("\nTTS Load Complete.\n"); |
|
0 commit comments