Skip to content

Commit bc04366

Browse files
committed
builds but crashes
1 parent 2bf1285 commit bc04366

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

43 files changed

+12183
-2
lines changed

Makefile

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,8 +55,8 @@ ifdef KCPP_SANITIZE
5555
CFLAGS += -fsanitize=undefined -fsanitize-undefined-trap-on-error
5656
CXXFLAGS += -fsanitize=undefined -fsanitize-undefined-trap-on-error
5757
endif
58-
CFLAGS += -I. -Iggml/include -Iggml/src -Iggml/src/ggml-cpu -Iinclude -Isrc -I./common -I./vendor -I./vendor/stb -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./otherarch/sdcpp -I./otherarch/sdcpp/thirdparty -I./include/vulkan -O3 -fno-finite-math-only -std=c11 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE -DGGML_USE_CPU -DGGML_USE_CPU_REPACK
59-
CXXFLAGS += -I. -Iggml/include -Iggml/src -Iggml/src/ggml-cpu -Iinclude -Isrc -I./common -I./vendor -I./vendor/stb -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./otherarch/sdcpp -I./otherarch/sdcpp/thirdparty -I./include/vulkan -O3 -fno-finite-math-only -std=c++17 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE -DGGML_USE_CPU -DGGML_USE_CPU_REPACK
58+
CFLAGS += -I. -Iggml/include -Iggml/src -Iggml/src/ggml-cpu -Iinclude -Isrc -I./common -I./vendor -I./vendor/stb -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./otherarch/sdcpp -I./otherarch/ttscpp/include -I./otherarch/ttscpp/src -I./otherarch/sdcpp/thirdparty -I./include/vulkan -O3 -fno-finite-math-only -std=c11 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE -DGGML_USE_CPU -DGGML_USE_CPU_REPACK
59+
CXXFLAGS += -I. -Iggml/include -Iggml/src -Iggml/src/ggml-cpu -Iinclude -Isrc -I./common -I./vendor -I./vendor/stb -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./otherarch/sdcpp -I./otherarch/ttscpp/include -I./otherarch/ttscpp/src -I./otherarch/sdcpp/thirdparty -I./include/vulkan -O3 -fno-finite-math-only -std=c++17 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE -DGGML_USE_CPU -DGGML_USE_CPU_REPACK
6060
ifndef KCPP_DEBUG
6161
CFLAGS += -DNDEBUG -s
6262
CXXFLAGS += -DNDEBUG -s
@@ -729,6 +729,8 @@ mainvk: tools/main/main.cpp common/arg.cpp build-info.h ggml_v4_vulkan.o ggml-cp
729729
$(CXX) $(CXXFLAGS) -DGGML_USE_VULKAN -DSD_USE_VULKAN $(filter-out %.h,$^) -o $@ $(LDFLAGS)
730730
embedding: examples/embedding/embedding.cpp common/arg.cpp build-info.h ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS)
731731
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
732+
ttscppmain: otherarch/ttscpp/examples/cli/cli.cpp otherarch/ttscpp/examples/cli/playback.cpp otherarch/ttscpp/examples/cli/playback.h otherarch/ttscpp/examples/cli/write_file.cpp otherarch/ttscpp/examples/cli/write_file.h otherarch/ttscpp/examples/cli/vad.cpp otherarch/ttscpp/examples/cli/vad.h otherarch/ttscpp/src/tts.cpp otherarch/ttscpp/src/tokenizer.cpp otherarch/ttscpp/src/sampler.cpp otherarch/ttscpp/src/parler_model.cpp otherarch/ttscpp/src/dac_model.cpp otherarch/ttscpp/src/ttsutil.cpp otherarch/ttscpp/src/args.cpp otherarch/ttscpp/src/t5_encoder_model.cpp otherarch/ttscpp/src/phonemizer.cpp otherarch/ttscpp/src/tts_model.cpp otherarch/ttscpp/src/kokoro_model.cpp otherarch/ttscpp/src/dia_model.cpp otherarch/ttscpp/src/orpheus_model.cpp otherarch/ttscpp/src/snac_model.cpp otherarch/ttscpp/src/general_neural_audio_codec.cpp ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS)
733+
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
732734

733735
ggml/src/ggml-vulkan-shaders.cpp:
734736
ifdef VULKAN_BUILD

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,7 @@ and it will install everything required. Alternatively, you can download the abo
191191
- KoboldCpp code and other files are also under the AGPL v3.0 License unless otherwise stated
192192
- Llama.cpp source repo is at https://github.com/ggml-org/llama.cpp (MIT)
193193
- Stable-diffusion.cpp source repo is at https://github.com/leejet/stable-diffusion.cpp (MIT)
194+
- TTS.cpp source repo is at https://github.com/mmwillet/TTS.cpp (MIT)
194195
- KoboldCpp source repo is at https://github.com/LostRuins/koboldcpp (AGPL)
195196
- KoboldAI Lite source repo is at https://github.com/LostRuins/lite.koboldai.net (AGPL)
196197
- For any further enquiries, contact @concedo on discord, or LostRuins on github.

otherarch/ttscpp/TTSCPP_LICENSE

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
The original TTS.cpp is made by mmwillet, repo can be found at https://github.com/mmwillet/TTS.cpp
2+
KoboldCpp uses a minimal implementation with some files removed.
3+
4+
MIT License
5+
6+
Copyright (c) 2023-2024 The ggml authors
7+
8+
Permission is hereby granted, free of charge, to any person obtaining a copy
9+
of this software and associated documentation files (the "Software"), to deal
10+
in the Software without restriction, including without limitation the rights
11+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12+
copies of the Software, and to permit persons to whom the Software is
13+
furnished to do so, subject to the following conditions:
14+
15+
The above copyright notice and this permission notice shall be included in all
16+
copies or substantial portions of the Software.
17+
18+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24+
SOFTWARE.
Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
#include "tts.h"
2+
#include "ttsargs.h"
3+
#include "ttscommon.h"
4+
#include "playback.h"
5+
#include "vad.h"
6+
#include "write_file.h"
7+
#include <thread>
8+
9+
class tts_timing_printer {
10+
const int64_t start_us{[] {
11+
ggml_time_init();
12+
return ggml_time_us();
13+
}()};
14+
public:
15+
~tts_timing_printer() {
16+
const int64_t end_us{ggml_time_us()};
17+
// Just a simple "total time" for now before adding "load" / "prompt eval" / "eval" from llama_print_timings
18+
printf("total time = %.2f ms\n", (end_us - start_us) / 1000.0f);
19+
}
20+
};
21+
22+
int main(int argc, const char ** argv) {
23+
const tts_timing_printer _{};
24+
float default_temperature = 1.0f;
25+
int default_n_threads = std::max((int)std::thread::hardware_concurrency(), 1);
26+
int default_top_k = 50;
27+
int default_max_tokens = 0;
28+
float default_repetition_penalty = 1.0f;
29+
float default_top_p = 1.0f;
30+
arg_list args;
31+
args.add_argument(string_arg("--model-path", "(REQUIRED) The local path of the gguf model file for Parler TTS mini or large v1, Dia, or Kokoro.", "-mp", true));
32+
args.add_argument(string_arg("--prompt", "(REQUIRED) The text prompt for which to generate audio in quotation markers.", "-p", true));
33+
args.add_argument(string_arg("--save-path", "(OPTIONAL) The path to save the audio output to in a .wav format. Defaults to TTS.cpp.wav", "-sp", false, "TTS.cpp.wav"));
34+
args.add_argument(float_arg("--temperature", "The temperature to use when generating outputs. Defaults to 1.0.", "-t", false, &default_temperature));
35+
args.add_argument(int_arg("--n-threads", "The number of cpu threads to run generation with. Defaults to hardware concurrency. If hardware concurrency cannot be determined then it defaults to 1.", "-nt", false, &default_n_threads));
36+
args.add_argument(int_arg("--topk", "(OPTIONAL) When set to an integer value greater than 0 generation uses nucleus sampling over topk nucleaus size. Defaults to 50.", "-tk", false, &default_top_k));
37+
args.add_argument(float_arg("--repetition-penalty", "The by channel repetition penalty to be applied the sampled output of the model. defaults to 1.0.", "-r", false, &default_repetition_penalty));
38+
args.add_argument(bool_arg("--use-metal", "(OPTIONAL) Whether to use metal acceleration", "-m"));
39+
args.add_argument(bool_arg("--no-cross-attn", "(OPTIONAL) Whether to not include cross attention", "-ca"));
40+
args.add_argument(string_arg("--conditional-prompt", "(OPTIONAL) A distinct conditional prompt to use for generating. If none is provided the preencoded prompt is used. '--text-encoder-path' must be set to use conditional generation.", "-cp", false));
41+
args.add_argument(string_arg("--text-encoder-path", "(OPTIONAL) The local path of the text encoder gguf model for conditional generaiton.", "-tep", false));
42+
args.add_argument(string_arg("--voice", "(OPTIONAL) The voice to use to generate the audio. This is only used for models with voice packs.", "-v", false, "af_alloy"));
43+
args.add_argument(bool_arg("--vad", "(OPTIONAL) whether to apply voice inactivity detection (VAD) and strip silence form the end of the output (particularly useful for Parler TTS). By default, no VAD is applied.", "-va"));
44+
args.add_argument(string_arg("--espeak-voice-id", "(OPTIONAL) The espeak voice id to use for phonemization. This should only be specified when the correct espeak voice cannot be inferred from the kokoro voice ( see MultiLanguage Configuration in the README for more info).", "-eid", false));
45+
args.add_argument(int_arg("--max-tokens", "(OPTIONAL) The max audio tokens or token batches to generate where each represents approximates 11 ms of audio. Only applied to Dia generation. If set to zero as is its default then the default max generation size. Warning values under 15 are not supported.", "-mt", false, &default_max_tokens));
46+
args.add_argument(float_arg("--top-p", "(OPTIONAL) the sum of probabilities to sample over. Must be a value between 0.0 and 1.0. Defaults to 1.0.", "-tp", false, &default_top_p));
47+
register_play_tts_response_args(args);
48+
args.parse(argc, argv);
49+
if (args.for_help) {
50+
args.help();
51+
exit(0);
52+
}
53+
args.validate();
54+
55+
std::string conditional_prompt = args.get_string_param("--conditional-prompt");
56+
std::string text_encoder_path = args.get_string_param("--text-encoder-path");
57+
if (conditional_prompt.size() > 0 && text_encoder_path.size() <= 0) {
58+
fprintf(stderr, "The '--text-encoder-path' must be specified when '--condtional-prompt' is passed.\n");
59+
exit(1);
60+
}
61+
62+
if (*args.get_float_param("--top-p") > 1.0f || *args.get_float_param("--top-p") <= 0.0f) {
63+
fprintf(stderr, "The '--top-p' value must be between 0.0 and 1.0. It was set to '%.6f'.\n", *args.get_float_param("--top-p"));
64+
exit(1);
65+
}
66+
67+
generation_configuration * config = new generation_configuration(
68+
args.get_string_param("--voice"),
69+
*args.get_int_param("--topk"),
70+
*args.get_float_param("--temperature"),
71+
*args.get_float_param("--repetition-penalty"),
72+
!args.get_bool_param("--no-cross-attn"),
73+
args.get_string_param("--espeak-voice-id"),
74+
*args.get_int_param("--max-tokens"),
75+
*args.get_float_param("--top-p"));
76+
77+
struct tts_runner * runner = runner_from_file(args.get_string_param("--model-path"), *args.get_int_param("--n-threads"), config, !args.get_bool_param("--use-metal"));
78+
79+
if (conditional_prompt.size() > 0) {
80+
update_conditional_prompt(runner, text_encoder_path, conditional_prompt, true);
81+
}
82+
tts_response data;
83+
84+
generate(runner, args.get_string_param("--prompt"), &data, config);
85+
if (data.n_outputs == 0) {
86+
fprintf(stderr, "Got empty response for prompt, '%s'.\n", args.get_string_param("--prompt").c_str());
87+
exit(1);
88+
}
89+
if (args.get_bool_param("--vad")) {
90+
apply_energy_voice_inactivity_detection(data, runner->sampling_rate);
91+
}
92+
if (!play_tts_response(args, data, runner->sampling_rate)) {
93+
write_audio_file(data, args.get_string_param("--save-path"), runner->sampling_rate);
94+
}
95+
return 0;
96+
}
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
#include <cstdint>
2+
#include "playback.h"
3+
4+
#ifndef SDL2_INSTALL
5+
// Variant used when SDL2_INSTALL is not defined: no playback option is
// registered, so "--play" stays hidden from the command line.
void register_play_tts_response_args(arg_list & args) {
    (void) args; // intentionally unused
}
8+
9+
// Variant used when SDL2_INSTALL is not defined: nothing is played.
// Always returns false so the caller can fall back to another output.
bool play_tts_response(arg_list & args, const tts_response & data, float sample_rate) {
    (void) args;
    (void) data;
    (void) sample_rate;
    const bool played = false;
    return played;
}
12+
#else
13+
#include "SDL.h"
14+
// SDL variant: adds the boolean "--play" option to the argument list.
void register_play_tts_response_args(arg_list & args) {
    args.add_argument(
        bool_arg(
            "--play",
            "(OPTIONAL) Whether to play back the audio immediately instead of saving it to file."));
}
17+
18+
// SDL variant: plays the generated audio on the default output device.
//
// Returns false without touching SDL when "--play" was not requested, and
// true once the queued audio has drained or an SDL_QUIT event was received.
// Any SDL failure prints a message to stderr and terminates with status 1.
bool play_tts_response(arg_list & args, const tts_response & data, float sample_rate) {
    if (!args.get_bool_param("--play")) {
        return false;
    }

    if (SDL_Init(SDL_INIT_AUDIO)) {
        fprintf(stderr, "SDL_INIT failed\n");
        exit(1);
    }

    // Mono 32-bit float output at the model's sample rate; no callback is
    // set because the samples are pushed with SDL_QueueAudio below.
    const SDL_AudioSpec desired{
        .freq = static_cast<int>(sample_rate),
        .format = AUDIO_F32,
        .channels = 1,
        .silence = 0,
        .padding = 0,
        .size = static_cast<unsigned>(data.n_outputs),
        .callback = nullptr,
        .userdata = nullptr,
    };
    const SDL_AudioDeviceID dev = SDL_OpenAudioDevice(nullptr, false, &desired, nullptr, 0);
    if (!dev) {
        fprintf(stderr, "SDL_OpenAudioDevice failed\n");
        exit(1);
    }

    // Unpause the device so queued audio starts playing.
    SDL_PauseAudioDevice(dev, false);
    // Cast explicitly so the format specifier matches on every platform,
    // whatever integer type n_outputs is declared with.
    fprintf(stdout, "Playing %lld samples of audio\n", static_cast<long long>(data.n_outputs));
    if (SDL_QueueAudio(dev, data.data, data.n_outputs * sizeof(data.data[0]))) {
        fprintf(stderr, "SDL_QueueAudio failed\n");
        exit(1);
    }

    // Poll until the queue is empty, leaving early on a quit event.
    SDL_Event event;
    while (SDL_GetQueuedAudioSize(dev)) {
        if (SDL_PollEvent(&event) && event.type == SDL_QUIT) break;
        SDL_Delay(100);
    }

    SDL_CloseAudioDevice(dev);
    SDL_Quit();

    return true;
}
62+
#endif
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
#pragma once
2+
3+
#include "ttsargs.h"
4+
#include "ttscommon.h"
5+
6+
void register_play_tts_response_args(arg_list & args);
7+
bool play_tts_response(arg_list & args, const tts_response & data, float sample_rate);
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
#include "vad.h"
2+
3+
// Returns the sum of the squared samples in chunk[0 .. count).
// A count of zero or less yields 0.
float energy(float * chunk, int count) {
    float en = 0.0f;
    for (int i = 0; i < count; i++) {
        // A plain multiply instead of powf(x, 2.0f): same value, no libm call.
        const float sample = chunk[i];
        en += sample * sample;
    }
    return en;
}
10+
11+
// Trims silence from the audio in `data` by shrinking data.n_outputs; the
// sample buffer itself is not modified or reallocated.
//
// The audio is split into frames of ms_per_frame milliseconds and the energy
// (sum of squared samples) of each frame is computed. Two rules are applied:
//   1. Early cutoff: as soon as early_cutoff_seconds_threshold seconds worth
//      of consecutive frames have an energy at or below
//      early_cutoff_energy_threshold, the audio is cut at that run of silence
//      (plus trailing_silent_frames frames) and the function returns.
//   2. Trailing trim: otherwise the frame energies are min-max normalized and
//      the frames at the end that fall below normalized_energy_threshold are
//      counted. If there are at least frame_threshold of them, all but
//      trailing_silent_frames of them are removed.
//
// The function leaves data unchanged when no complete frame fits in it, when
// the frame size is not positive, or when the scratch buffer cannot be
// allocated. The resulting n_outputs never exceeds its value on entry.
void apply_energy_voice_inactivity_detection(
        tts_response & data,
        float sample_rate,
        int ms_per_frame,
        int frame_threshold,
        float normalized_energy_threshold,
        int trailing_silent_frames,
        int early_cutoff_seconds_threshold,
        float early_cutoff_energy_threshold) {
    // ms_per_frame and samples_per_frame are both used as divisors below.
    if (ms_per_frame <= 0) {
        return;
    }
    const int samples_per_frame = (int) (ms_per_frame * sample_rate / 1000.0f);
    if (samples_per_frame <= 0) {
        return;
    }
    const long long total_samples = static_cast<long long>(data.n_outputs);
    const int n_frames = (int) (data.n_outputs / samples_per_frame);
    if (n_frames <= 0) {
        return;
    }
    const int early_cutoff_frames = (int) ((early_cutoff_seconds_threshold * 1000) / ms_per_frame);

    float * energies = (float *) malloc(n_frames * sizeof(float));
    if (energies == nullptr) {
        return;
    }

    // for min-max normalization
    float max_energy = 0.0f;
    float min_energy = 0.0f;
    int silent_frames = 0;

    // compute the energies and the necessary elements for min-max normalization
    for (int i = 0; i < n_frames; i++) {
        float * chunk = data.data + static_cast<long long>(i) * samples_per_frame;
        energies[i] = energy(chunk, samples_per_frame);
        if (i == 0) {
            max_energy = energies[i];
            min_energy = energies[i];
        } else if (energies[i] > max_energy) {
            max_energy = energies[i];
        } else if (energies[i] < min_energy) {
            min_energy = energies[i];
        }

        // Track the current run of consecutive silent frames.
        if (energies[i] <= early_cutoff_energy_threshold) {
            silent_frames++;
        } else {
            silent_frames = 0;
        }

        if (silent_frames >= early_cutoff_frames) {
            // NOTE(review): the silent run starts at frame (i - silent_frames + 1),
            // so this keeps one frame fewer than trailing_silent_frames of
            // silence. Kept as in the original -- confirm whether intended.
            long long kept = static_cast<long long>(i + trailing_silent_frames - silent_frames) * samples_per_frame;
            // Clamp so unusual parameters can neither produce a negative
            // length nor one that runs past the existing samples.
            if (kept < 0) {
                kept = 0;
            }
            if (kept > total_samples) {
                kept = total_samples;
            }
            data.n_outputs = static_cast<decltype(data.n_outputs)>(kept);
            free(energies);
            return;
        }
    }

    // Count the frames at the end whose normalized energy is below the
    // threshold. With a zero energy range normalization is undefined, so
    // nothing is treated as silent in that case.
    const float energy_range = max_energy - min_energy;
    int concurrent_silent_frames = 0;
    if (energy_range > 0.0f) {
        for (int i = n_frames; i > 0; i--) {
            const float frame_energy = (energies[i - 1] - min_energy) / energy_range;
            if (frame_energy < normalized_energy_threshold) {
                concurrent_silent_frames++;
            } else {
                break;
            }
        }
    }
    free(energies);

    // Only ever shrink: if fewer silent frames were found than should be
    // kept, there is nothing to remove.
    const int removable_frames = concurrent_silent_frames - trailing_silent_frames;
    if (concurrent_silent_frames >= frame_threshold && removable_frames > 0) {
        const long long removed = static_cast<long long>(removable_frames) * samples_per_frame;
        const long long kept = removed >= total_samples ? 0 : total_samples - removed;
        data.n_outputs = static_cast<decltype(data.n_outputs)>(kept);
    }
}
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
#pragma once
2+
3+
#include <math.h>
4+
#include "ttscommon.h"
5+
6+
float energy(float * chunk, int count);
7+
8+
/*
9+
* This function is used to trim trailing silence at the end of audio data within the tts_response struct.
10+
* It detects silence by min-max normalizing energy and trimming frames which fall under a relative threshold.
11+
*/
12+
void apply_energy_voice_inactivity_detection(
13+
tts_response & data,
14+
float sample_rate = 44100.0f, // the sample rate of the audio
15+
int ms_per_frame = 10, // the audio time per frame
16+
int frame_threshold = 20, // the number of trailing empty frames upon which silence is clipped.
17+
float normalized_energy_threshold = 0.01f, // the normalized threshold to determine a silent frame
18+
int trailing_silent_frames = 5, // the number of frames of silence to allow
19+
int early_cutoff_seconds_threshold = 3, // the number of seconds of complete silence before terminating and cutting audio early
20+
float early_cutoff_energy_threshold = 0.1 // the energy threshold for treating a frame as silent for early cutoff
21+
);
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
#include <cstdint>
2+
#include "write_file.h"
3+
#include "audio_file.h"
4+
5+
// Writes the samples in `data` to `path` as a single channel Wave file with
// the given sample rate, announcing the target path on stdout first.
// On success a summary of the written file is printed; on failure an error
// is reported on stderr instead.
void write_audio_file(const tts_response & data, std::string path, float sample_rate) {
    fprintf(stdout, "Writing audio file: %s\n", path.c_str());
    AudioFile<float> file;
    file.setSampleRate(sample_rate);
    // Copy the generated samples into the first (and only) channel.
    file.samples[0] = std::vector(data.data, data.data + data.n_outputs);
    // save() reports failure through its return value; do not ignore it.
    if (!file.save(path, AudioFileFormat::Wave)) {
        fprintf(stderr, "Failed to write audio file: %s\n", path.c_str());
        return;
    }
    file.printSummary();
}
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
#pragma once
2+
3+
#include "ttscommon.h"
4+
5+
void write_audio_file(const tts_response & data, std::string path = "TTS.cpp.wav", float sample_rate = 44100.0f);

0 commit comments

Comments
 (0)