LostRuins
diff --git a/‎Makefile‎
Lines changed: 4 additions & 2 deletions b/‎Makefile‎
Lines changed: 4 additions & 2 deletions
diff --git a/‎README.md‎
Lines changed: 1 addition & 0 deletions b/‎README.md‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎otherarch/ttscpp/TTSCPP_LICENSE‎
Lines changed: 24 additions & 0 deletions b/‎otherarch/ttscpp/TTSCPP_LICENSE‎
Lines changed: 24 additions & 0 deletions
diff --git a/‎otherarch/ttscpp/examples/cli/cli.cpp‎
Lines changed: 96 additions & 0 deletions b/‎otherarch/ttscpp/examples/cli/cli.cpp‎
Lines changed: 96 additions & 0 deletions
diff --git a/‎otherarch/ttscpp/examples/cli/playback.cpp‎
Lines changed: 62 additions & 0 deletions b/‎otherarch/ttscpp/examples/cli/playback.cpp‎
Lines changed: 62 additions & 0 deletions
diff --git a/‎otherarch/ttscpp/examples/cli/playback.h‎
Lines changed: 7 additions & 0 deletions b/‎otherarch/ttscpp/examples/cli/playback.h‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎otherarch/ttscpp/examples/cli/vad.cpp‎
Lines changed: 68 additions & 0 deletions b/‎otherarch/ttscpp/examples/cli/vad.cpp‎
Lines changed: 68 additions & 0 deletions
diff --git a/‎otherarch/ttscpp/examples/cli/vad.h‎
Lines changed: 21 additions & 0 deletions b/‎otherarch/ttscpp/examples/cli/vad.h‎
Lines changed: 21 additions & 0 deletions
diff --git a/‎otherarch/ttscpp/examples/cli/write_file.cpp‎
Lines changed: 12 additions & 0 deletions b/‎otherarch/ttscpp/examples/cli/write_file.cpp‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎otherarch/ttscpp/examples/cli/write_file.h‎
Lines changed: 5 additions & 0 deletions b/‎otherarch/ttscpp/examples/cli/write_file.h‎
Lines changed: 5 additions & 0 deletions
@@ -55,8 +55,8 @@ ifdef KCPP_SANITIZE
 	CFLAGS += -fsanitize=undefined -fsanitize-undefined-trap-on-error
 	CXXFLAGS += -fsanitize=undefined -fsanitize-undefined-trap-on-error
 endif
-CFLAGS   += -I. -Iggml/include -Iggml/src -Iggml/src/ggml-cpu -Iinclude -Isrc -I./common -I./vendor -I./vendor/stb -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./otherarch/sdcpp -I./otherarch/sdcpp/thirdparty -I./include/vulkan -O3 -fno-finite-math-only -std=c11 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE -DGGML_USE_CPU -DGGML_USE_CPU_REPACK
-CXXFLAGS += -I. -Iggml/include -Iggml/src -Iggml/src/ggml-cpu -Iinclude -Isrc -I./common -I./vendor -I./vendor/stb -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./otherarch/sdcpp -I./otherarch/sdcpp/thirdparty -I./include/vulkan -O3 -fno-finite-math-only -std=c++17 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE -DGGML_USE_CPU -DGGML_USE_CPU_REPACK
+CFLAGS   += -I. -Iggml/include -Iggml/src -Iggml/src/ggml-cpu -Iinclude -Isrc -I./common -I./vendor -I./vendor/stb -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./otherarch/sdcpp -I./otherarch/ttscpp/include -I./otherarch/ttscpp/src -I./otherarch/sdcpp/thirdparty -I./include/vulkan -O3 -fno-finite-math-only -std=c11 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE -DGGML_USE_CPU -DGGML_USE_CPU_REPACK
+CXXFLAGS += -I. -Iggml/include -Iggml/src -Iggml/src/ggml-cpu -Iinclude -Isrc -I./common -I./vendor -I./vendor/stb -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./otherarch/sdcpp -I./otherarch/ttscpp/include -I./otherarch/ttscpp/src -I./otherarch/sdcpp/thirdparty -I./include/vulkan -O3 -fno-finite-math-only -std=c++17 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE -DGGML_USE_CPU -DGGML_USE_CPU_REPACK
 ifndef KCPP_DEBUG
 	CFLAGS += -DNDEBUG -s
 	CXXFLAGS += -DNDEBUG -s
@@ -729,6 +729,8 @@ mainvk: tools/main/main.cpp common/arg.cpp build-info.h ggml_v4_vulkan.o ggml-cp
 	$(CXX) $(CXXFLAGS) -DGGML_USE_VULKAN -DSD_USE_VULKAN $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 embedding: examples/embedding/embedding.cpp common/arg.cpp build-info.h ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+# ttscppmain: standalone TTS.cpp command line example. Compiles the CLI sources under
+# otherarch/ttscpp/examples/cli together with the otherarch/ttscpp/src sources and links them against
+# the prebuilt ggml/llama objects. Header prerequisites are listed for dependency tracking only; the
+# recipe filters them out of the compile command.
+ttscppmain: otherarch/ttscpp/examples/cli/cli.cpp otherarch/ttscpp/examples/cli/playback.cpp otherarch/ttscpp/examples/cli/playback.h otherarch/ttscpp/examples/cli/write_file.cpp otherarch/ttscpp/examples/cli/write_file.h otherarch/ttscpp/examples/cli/vad.cpp otherarch/ttscpp/examples/cli/vad.h otherarch/ttscpp/src/tts.cpp otherarch/ttscpp/src/tokenizer.cpp otherarch/ttscpp/src/sampler.cpp otherarch/ttscpp/src/parler_model.cpp otherarch/ttscpp/src/dac_model.cpp otherarch/ttscpp/src/ttsutil.cpp otherarch/ttscpp/src/args.cpp otherarch/ttscpp/src/t5_encoder_model.cpp otherarch/ttscpp/src/phonemizer.cpp otherarch/ttscpp/src/tts_model.cpp otherarch/ttscpp/src/kokoro_model.cpp otherarch/ttscpp/src/dia_model.cpp otherarch/ttscpp/src/orpheus_model.cpp otherarch/ttscpp/src/snac_model.cpp otherarch/ttscpp/src/general_neural_audio_codec.cpp ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
 ggml/src/ggml-vulkan-shaders.cpp:
 ifdef VULKAN_BUILD
 
@@ -191,6 +191,7 @@ and it will install everything required. Alternatively, you can download the abo
 - KoboldCpp code and other files are also under the AGPL v3.0 License unless otherwise stated
 - Llama.cpp source repo is at https://github.com/ggml-org/llama.cpp (MIT)
 - Stable-diffusion.cpp source repo is at https://github.com/leejet/stable-diffusion.cpp (MIT)
+- TTS.cpp source repo is at https://github.com/mmwillet/TTS.cpp (MIT)
 - KoboldCpp source repo is at https://github.com/LostRuins/koboldcpp (AGPL)
 - KoboldAI Lite source repo is at https://github.com/LostRuins/lite.koboldai.net (AGPL)
 - For any further enquiries, contact @concedo on discord, or LostRuins on github.
 
@@ -0,0 +1,24 @@
+The original TTS.cpp was created by mmwillet; its repository can be found at https://github.com/mmwillet/TTS.cpp
+KoboldCpp bundles a minimal version of it, with some files removed.
+
+MIT License
+
+Copyright (c) 2023-2024 The ggml authors
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
@@ -0,0 +1,96 @@
+#include "tts.h"
+#include "ttsargs.h"
+#include "ttscommon.h"
+#include "playback.h"
+#include "vad.h"
+#include "write_file.h"
+#include <thread>
+
+// Scoped wall-clock timer: captures a start timestamp when constructed and prints the elapsed
+// time in milliseconds to stdout when destroyed.
+class tts_timing_printer {
+    // Initialized once at construction; the lambda sets up the ggml timer before taking the first reading.
+    const int64_t start_us{[] {
+        ggml_time_init();
+        return ggml_time_us();
+    }()};
+public:
+    ~tts_timing_printer() {
+        const int64_t end_us{ggml_time_us()};
+        // Just a simple "total time" for now before adding "load" / "prompt eval" / "eval" from llama_print_timings
+        // Divide in double precision: converting the microsecond count to float drops low-order
+        // digits once a run lasts longer than roughly 16.7 seconds (2^24 microseconds).
+        printf("total time = %.2f ms\n", (end_us - start_us) / 1000.0);
+    }
+};
+
+// Entry point of the TTS command line example.
+//
+// Registers and parses the command line options, validates them, loads the model named by
+// '--model-path', generates audio for '--prompt', optionally trims silence ('--vad'), and then either
+// plays the result (when play_tts_response handles it) or writes it to '--save-path'.
+// Exits with status 1 on invalid option combinations, a failed model load, or an empty generation result.
+int main(int argc, const char ** argv) {
+    // Prints the total run time when main returns normally.
+    const tts_timing_printer _{};
+    // Defaults are kept in locals because the int/float argument helpers take pointers to them.
+    float default_temperature = 1.0f;
+    int default_n_threads = std::max((int)std::thread::hardware_concurrency(), 1);
+    int default_top_k = 50;
+    int default_max_tokens = 0;
+    float default_repetition_penalty = 1.0f;
+    float default_top_p = 1.0f;
+    arg_list args;
+    args.add_argument(string_arg("--model-path", "(REQUIRED) The local path of the gguf model file for Parler TTS mini or large v1, Dia, or Kokoro.", "-mp", true));
+    args.add_argument(string_arg("--prompt", "(REQUIRED) The text prompt for which to generate audio in quotation markers.", "-p", true));
+    args.add_argument(string_arg("--save-path", "(OPTIONAL) The path to save the audio output to in a .wav format. Defaults to TTS.cpp.wav", "-sp", false, "TTS.cpp.wav"));
+    args.add_argument(float_arg("--temperature", "The temperature to use when generating outputs. Defaults to 1.0.", "-t", false, &default_temperature));
+    args.add_argument(int_arg("--n-threads", "The number of cpu threads to run generation with. Defaults to hardware concurrency. If hardware concurrency cannot be determined then it defaults to 1.", "-nt", false, &default_n_threads));
+    args.add_argument(int_arg("--topk", "(OPTIONAL) When set to an integer value greater than 0 generation uses nucleus sampling over topk nucleaus size. Defaults to 50.", "-tk", false, &default_top_k));
+    args.add_argument(float_arg("--repetition-penalty", "The by channel repetition penalty to be applied the sampled output of the model. defaults to 1.0.", "-r", false, &default_repetition_penalty));
+    args.add_argument(bool_arg("--use-metal", "(OPTIONAL) Whether to use metal acceleration", "-m"));
+    args.add_argument(bool_arg("--no-cross-attn", "(OPTIONAL) Whether to not include cross attention", "-ca"));
+    args.add_argument(string_arg("--conditional-prompt", "(OPTIONAL) A distinct conditional prompt to use for generating. If none is provided the preencoded prompt is used. '--text-encoder-path' must be set to use conditional generation.", "-cp", false));
+    args.add_argument(string_arg("--text-encoder-path", "(OPTIONAL) The local path of the text encoder gguf model for conditional generaiton.", "-tep", false));
+    args.add_argument(string_arg("--voice", "(OPTIONAL) The voice to use to generate the audio. This is only used for models with voice packs.", "-v", false, "af_alloy"));
+    args.add_argument(bool_arg("--vad", "(OPTIONAL) whether to apply voice inactivity detection (VAD) and strip silence form the end of the output (particularly useful for Parler TTS). By default, no VAD is applied.", "-va"));
+    args.add_argument(string_arg("--espeak-voice-id", "(OPTIONAL) The espeak voice id to use for phonemization. This should only be specified when the correct espeak voice cannot be inferred from the kokoro voice ( see MultiLanguage Configuration in the README for more info).", "-eid", false));
+    args.add_argument(int_arg("--max-tokens", "(OPTIONAL) The max audio tokens or token batches to generate where each represents approximates 11 ms of audio. Only applied to Dia generation. If set to zero as is its default then the default max generation size. Warning values under 15 are not supported.", "-mt", false, &default_max_tokens));
+    args.add_argument(float_arg("--top-p", "(OPTIONAL) the sum of probabilities to sample over. Must be a value between 0.0 and 1.0. Defaults to 1.0.", "-tp", false, &default_top_p));
+    // Lets the playback module add its own options (e.g. '--play' when built with SDL).
+    register_play_tts_response_args(args);
+    args.parse(argc, argv);
+    if (args.for_help) {
+        args.help();
+        exit(0);
+    }
+    args.validate();
+
+    // A conditional prompt can only be encoded when a text encoder model is supplied.
+    std::string conditional_prompt = args.get_string_param("--conditional-prompt");
+    std::string text_encoder_path = args.get_string_param("--text-encoder-path");
+    if (conditional_prompt.size() > 0 && text_encoder_path.size() <= 0) {
+        fprintf(stderr, "The '--text-encoder-path' must be specified when '--conditional-prompt' is passed.\n");
+        exit(1);
+    }
+
+    // Read '--top-p' once; it is validated here and forwarded to the generation configuration below.
+    const float top_p = *args.get_float_param("--top-p");
+    if (top_p > 1.0f || top_p <= 0.0f) {
+        fprintf(stderr, "The '--top-p' value must be between 0.0 and 1.0. It was set to '%.6f'.\n", top_p);
+        exit(1);
+    }
+
+    // NOTE(review): config and runner are never released here; ownership rules of runner_from_file are
+    // not visible from this file, so they are left to process teardown as before.
+    generation_configuration * config = new generation_configuration(
+        args.get_string_param("--voice"),
+        *args.get_int_param("--topk"),
+        *args.get_float_param("--temperature"),
+        *args.get_float_param("--repetition-penalty"),
+        !args.get_bool_param("--no-cross-attn"),
+        args.get_string_param("--espeak-voice-id"),
+        *args.get_int_param("--max-tokens"),
+        top_p);
+
+    struct tts_runner * runner = runner_from_file(args.get_string_param("--model-path"), *args.get_int_param("--n-threads"), config, !args.get_bool_param("--use-metal"));
+    // Defensive: the runner is dereferenced below, so never continue with a null pointer.
+    // NOTE(review): runner_from_file may already abort on failure instead of returning null - confirm.
+    if (runner == nullptr) {
+        fprintf(stderr, "Failed to load a model from '%s'.\n", args.get_string_param("--model-path").c_str());
+        exit(1);
+    }
+
+    if (conditional_prompt.size() > 0) {
+        update_conditional_prompt(runner, text_encoder_path, conditional_prompt, true);
+    }
+    tts_response data;
+
+    generate(runner, args.get_string_param("--prompt"), &data, config);
+    if (data.n_outputs == 0) {
+        fprintf(stderr, "Got empty response for prompt, '%s'.\n", args.get_string_param("--prompt").c_str());
+        exit(1);
+    }
+    if (args.get_bool_param("--vad")) {
+        apply_energy_voice_inactivity_detection(data, runner->sampling_rate);
+    }
+    // Fall back to writing a file whenever playback was not requested or is not available.
+    if (!play_tts_response(args, data, runner->sampling_rate)) {
+        write_audio_file(data, args.get_string_param("--save-path"), runner->sampling_rate);
+    }
+    return 0;
+}
@@ -0,0 +1,62 @@
+#include <cstdint>
+#include "playback.h"
+
+#ifndef SDL2_INSTALL
+// Build without SDL2_INSTALL: playback is unavailable, so no '--play' option is registered.
+void register_play_tts_response_args(arg_list & args) {
+    static_cast<void>(args); // intentionally unused in this configuration
+}
+
+// Build without SDL2_INSTALL: nothing can be played, so always report that playback did not happen.
+bool play_tts_response(arg_list & args, const tts_response & data, float sample_rate) {
+    static_cast<void>(args);
+    static_cast<void>(data);
+    static_cast<void>(sample_rate);
+    const bool played = false;
+    return played;
+}
+#else
+#include "SDL.h"
+// SDL build: registers the boolean '--play' option so that playback can be requested on the command line.
+void register_play_tts_response_args(arg_list & args) {
+    args.add_argument(bool_arg("--play", "(OPTIONAL) Whether to play back the audio immediately instead of saving it to file."));
+}
+
+// SDL build: plays the generated audio on the default output device when '--play' was passed.
+//
+// Returns false without touching SDL when '--play' is not set, and true once the queued audio has
+// drained or an SDL_QUIT event was received. Any SDL failure prints a message to stderr and
+// terminates the process with exit code 1.
+bool play_tts_response(arg_list & args, const tts_response & data, float sample_rate) {
+    if (!args.get_bool_param("--play")) {
+        return false;
+    }
+
+    if (SDL_Init(SDL_INIT_AUDIO)) {
+        fprintf(stderr, "SDL_INIT failed\n");
+        exit(1);
+    }
+
+    // Value-initialize and assign field by field instead of using designated initializers: those are
+    // a C++20 feature while this project is built with -std=c++17. All fields not assigned below
+    // (silence, padding, samples, callback, userdata) stay zero / null, as before.
+    SDL_AudioSpec desired{};
+    desired.freq = static_cast<int>(sample_rate);
+    desired.format = AUDIO_F32;
+    desired.channels = 1;
+    desired.size = static_cast<unsigned>(data.n_outputs);
+
+    const SDL_AudioDeviceID dev = SDL_OpenAudioDevice(nullptr, false, &desired, nullptr, 0);
+    if (!dev) {
+        fprintf(stderr, "SDL_OpenAudioDevice failed\n");
+        exit(1);
+    }
+
+    // Unpause the device first; playback starts as soon as samples are queued.
+    SDL_PauseAudioDevice(dev, false);
+    // Cast explicitly so that the format specifier matches on every platform regardless of the
+    // exact integer type of n_outputs.
+    fprintf(stdout, "Playing %llu samples of audio\n", static_cast<unsigned long long>(data.n_outputs));
+    if (SDL_QueueAudio(dev, data.data, data.n_outputs * sizeof(data.data[0]))) {
+        fprintf(stderr, "SDL_QueueAudio failed\n");
+        exit(1);
+    }
+
+    // Poll until the queue is empty, checking about every 100 ms whether the user asked to quit.
+    SDL_Event event;
+    while (SDL_GetQueuedAudioSize(dev)) {
+        if (SDL_PollEvent(&event) && event.type == SDL_QUIT) break;
+        SDL_Delay(100);
+    }
+
+    SDL_CloseAudioDevice(dev);
+    SDL_Quit();
+
+    return true;
+}
+#endif
@@ -0,0 +1,7 @@
+#pragma once
+
+#include "ttsargs.h"
+#include "ttscommon.h"
+
+// Registers the playback related command line options on `args`.
+// When built with SDL2_INSTALL this adds '--play'; otherwise it registers nothing.
+void register_play_tts_response_args(arg_list & args);
+// Plays `data` at `sample_rate` when '--play' was requested and playback support is compiled in.
+// Returns true if the audio was played, false if it was not (always false without SDL2_INSTALL).
+bool play_tts_response(arg_list & args, const tts_response & data, float sample_rate);
@@ -0,0 +1,68 @@
+#include "vad.h"
+
+// Returns the energy of a frame: the sum of the squares of the first `count` samples in `chunk`.
+// A non-positive `count` yields 0 and `chunk` is not read.
+float energy(float * chunk, int count) {
+	float en = 0.0f;
+	for (int i = 0; i < count; i++) {
+		// Plain multiplication instead of powf(x, 2.0f): avoids a general-purpose math call per sample.
+		en += chunk[i] * chunk[i];
+	}
+	return en;
+}
+
+// Trims silence from the end of `data` by lowering data.n_outputs; the sample buffer is not modified.
+//
+// The audio is split into frames of `ms_per_frame` milliseconds and one energy value is computed per frame.
+//   1. Early cutoff: as soon as `early_cutoff_seconds_threshold` seconds worth of consecutive frames have
+//      an energy at or below `early_cutoff_energy_threshold`, the audio is cut near the start of that
+//      silent run and the function returns.
+//   2. Trailing trim: otherwise the energies are min-max normalized and the frames at the end whose
+//      normalized energy is below `normalized_energy_threshold` are counted. If at least `frame_threshold`
+//      such frames are found, all but `trailing_silent_frames` of them are removed.
+//
+// Degenerate inputs (non-positive frame length, less than one full frame of audio, allocation failure,
+// or identical energy in every frame) leave `data` untouched.
+void apply_energy_voice_inactivity_detection(
+	tts_response & data,
+	float sample_rate,
+	int ms_per_frame,
+	int frame_threshold,
+	float normalized_energy_threshold,
+	int trailing_silent_frames,
+	int early_cutoff_seconds_threshold,
+	float early_cutoff_energy_threshold) {
+	// Guard the integer divisions below against a zero (or negative) divisor.
+	if (ms_per_frame <= 0) {
+		return;
+	}
+	int samples_per_frame = (int) (ms_per_frame * sample_rate / 1000.0f);
+	if (samples_per_frame <= 0) {
+		return;
+	}
+	int n_frames = (int) (data.n_outputs / samples_per_frame);
+	if (n_frames <= 0) {
+		// less than one full frame of audio: nothing to analyse
+		return;
+	}
+	int early_cuttoff_frames = (int)((early_cutoff_seconds_threshold * 1000) / ms_per_frame);
+
+	// for min-max normalization
+	float max_energy = 0.0f;
+	float min_energy = 0.0f;
+	float * energies = (float *) malloc(n_frames * sizeof(float));
+	if (energies == nullptr) {
+		// out of memory: skip trimming rather than writing through a null pointer
+		return;
+	}
+	int silent_frames = 0;
+
+	// compute the energies and the necessary elements for min-max normalization
+	for (int i = 0; i < n_frames; i++) {
+		float * chunk = data.data + i * samples_per_frame;
+		energies[i] = energy(chunk, samples_per_frame);
+		if (i == 0) {
+			max_energy = energies[i];
+			min_energy = energies[i];
+		} else if (energies[i] > max_energy) {
+			max_energy = energies[i];
+		} else if (energies[i] < min_energy) {
+			min_energy = energies[i];
+		}
+		if (energies[i] <= early_cutoff_energy_threshold) {
+			silent_frames++;
+		} else {
+			silent_frames = 0;
+		}
+		if (silent_frames >= early_cuttoff_frames) {
+			// Keep the audio before the silent run plus the allowed trailing silence. Clamp so that unusual
+			// parameter combinations can neither produce a negative length nor one past the analysed frames.
+			int keep_frames = i + trailing_silent_frames - silent_frames;
+			if (keep_frames < 0) {
+				keep_frames = 0;
+			}
+			if (keep_frames > n_frames) {
+				keep_frames = n_frames;
+			}
+			data.n_outputs = keep_frames * samples_per_frame;
+			free(energies);
+			return;
+		}
+	}
+
+	int concurrent_silent_frames = 0;
+	const float energy_range = max_energy - min_energy;
+
+	// When every frame has the same energy the normalization would divide by zero; there is no relative
+	// silence to detect in that case, so the count stays at zero.
+	if (energy_range > 0.0f) {
+		// walk backwards from the last frame and count the unbroken run of silent frames
+		for (int i = n_frames; i > 0; i--) {
+			float frame_energy = (energies[i-1] - min_energy) / energy_range;
+			if (frame_energy < normalized_energy_threshold) {
+				concurrent_silent_frames++;
+			} else {
+				break;
+			}
+		}
+	}
+	// Only trim when there is more silence than should be kept; otherwise the subtraction below would
+	// enlarge n_outputs beyond the available samples.
+	if (concurrent_silent_frames >= frame_threshold && concurrent_silent_frames > trailing_silent_frames) {
+		int trim_frames = concurrent_silent_frames - trailing_silent_frames;
+		if (trim_frames > n_frames) {
+			trim_frames = n_frames;
+		}
+		data.n_outputs -= (trim_frames * samples_per_frame);
+	}
+	free(energies);
+}
@@ -0,0 +1,21 @@
+#pragma once
+
+#include <math.h>
+#include "ttscommon.h"
+
+// Returns the sum of the squared values of the first `count` samples in `chunk`.
+float energy(float * chunk, int count);
+
+/*
+ * This function is used to trim trailing silence at the end of audio data within the tts_response struct.
+ * It detects silence by min-max normalizing energy and trimming frames which fall under a relative threshold.
+ * Trimming only reduces data.n_outputs; the samples themselves are left in place.
+ * Independently of the relative threshold, a long enough run of frames whose absolute energy is at or below
+ * early_cutoff_energy_threshold cuts the audio early at that run.
+ */
+void apply_energy_voice_inactivity_detection(
+	tts_response & data,
+	float sample_rate = 44100.0f, // the sample rate of the audio
+	int ms_per_frame = 10, // the audio time per frame
+	int frame_threshold = 20, // the number of trailing empty frames upon which silence is clipped.
+	float normalized_energy_threshold = 0.01f, // the normalized threshold to determine a silent frame
+	int trailing_silent_frames = 5, // the number of frames of silence to allow
+	int early_cutoff_seconds_threshold = 3, // the number of seconds of complete silence before terminating and cutting audio early
+	float early_cutoff_energy_threshold = 0.1 // the energy threshold for treating a frame as silent for early cutoff
+);
@@ -0,0 +1,12 @@
+#include <cstdint>
+#include "write_file.h"
+#include "audio_file.h"
+
+// Writes the first data.n_outputs samples of `data` as a single channel WAVE file at `path`.
+// Progress is reported on stdout; a failed save is reported on stderr and the summary is skipped.
+void write_audio_file(const tts_response & data, std::string path, float sample_rate) {
+    fprintf(stdout, "Writing audio file: %s\n", path.c_str());
+    AudioFile<float> file;
+    // The audio file stores an integral sample rate; make the truncation of the float explicit.
+    file.setSampleRate(static_cast<uint32_t>(sample_rate));
+    // Copy the samples straight into channel 0.
+    file.samples[0].assign(data.data, data.data + data.n_outputs);
+    // save() reports failure through its return value; do not claim success silently.
+    if (!file.save(path, AudioFileFormat::Wave)) {
+        fprintf(stderr, "Failed to write audio file: %s\n", path.c_str());
+        return;
+    }
+    file.printSummary();
+}
@@ -0,0 +1,5 @@
+#pragma once
+
+#include "ttscommon.h"
+
+// Saves the samples in `data` to a WAVE file at `path`, using `sample_rate` as the file's sample rate.
+void write_audio_file(const tts_response & data, std::string path = "TTS.cpp.wav", float sample_rate = 44100.0f);