can't resolve the clicking

LostRuins · LostRuins · commit 75c919cfd4df · 2025-08-26T17:55:03.000+08:00
diff --git a/kokoro_ipa.embd b/kokoro_ipa.embd
@@ -19951,6 +19951,7 @@ excursions,ɪkskˈɜɹʒənz
 excusable,ɪkskjˈuzəbᵊl
 excused,ɪkskjˈuzd
 excusing,ɪkskjˈuzɪŋ
+excuses,ɪkskjˈuzᵻz
 exec,ɛɡzˈɛk
 execrable,ˈɛksəkɹəbᵊl
 execration,ˌɛksəkɹˈAʃən
@@ -39067,16 +39068,16 @@ organelles,ˌɔɹɡənˈɛlz
 organically,ɔɹɡˈænəkᵊli
 organic,ɔɹɡˈænɪk
 organics,ɔɹɡˈænɪks
-organisationally,ˌɔɹɡənəzˈAʃənᵊli
-organisational,ˌɔɹɡənəzˈAʃənᵊl
-organisation,ˌɔɹɡənəzˈAʃən
-organisations,ˌɔɹɡənəzˈAʃənz
-organised,ˈɔɹɡənˌIzd
-organise,ˈɔɹɡənˌIz
-organiser,ˈɔɹɡənˌIzəɹ
-organisers,ˈɔɹɡənˌIzəɹz
-organises,ˈɔɹɡənˌIzᵻz
-organising,ˈɔɹɡənˌIzɪŋ
+organizationally,ˌɔɹɡənəzˈAʃənᵊli
+organizational,ˌɔɹɡənəzˈAʃənᵊl
+organization,ˌɔɹɡənəzˈAʃən
+organizations,ˌɔɹɡənəzˈAʃənz
+organized,ˈɔɹɡənˌIzd
+organize,ˈɔɹɡənˌIz
+organizer,ˈɔɹɡənˌIzəɹ
+organizers,ˈɔɹɡənˌIzəɹz
+organizes,ˈɔɹɡənˌIzᵻz
+organizing,ˈɔɹɡənˌIzɪŋ
 organism,ˈɔɹɡənˌɪzəm
 organisms,ˈɔɹɡənˌɪzəmz
 organist,ˈɔɹɡənɪst
diff --git a/otherarch/tts_adapter.cpp b/otherarch/tts_adapter.cpp
@@ -64,6 +64,85 @@ struct wav_header {
     uint32_t data_size;
 };
 
+// #include <vector>
+// #include <cstdio>
+// #include <cmath>
+
+// static void audio_post_clean(std::vector<float>& data) { // detect low-level bursts bracketed by silence (clicks) and attenuate them in place
+//     const float silenceThreshold = 1e-5f;
+//     const float noiseThreshold   = 1e-3f;
+//     const size_t minSilence      = 100;   // samples
+//     const size_t noiseSpan       = 150;   // samples
+//     const size_t minSilence2      = 100;   // samples
+
+//     size_t len = data.size();
+
+//     int silencecounterA = 0;
+//     int noisecounterA   = 0;
+//     int silencecounterB = 0;
+//     int state = 0; // 0 = finding first silence, 1 = measuring noise, 2 = finding second silence
+
+//     size_t noiseStart = 0;
+
+//     for (size_t i = 0; i < len; ++i) {
+//         float sample = std::fabs(data[i]);
+
+//         if (state == 0) { // finding first silence
+//             if (sample < silenceThreshold) {
+//                 silencecounterA++;
+//             } else {
+//                 if (silencecounterA >= minSilence) {
+//                     state = 1;
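+//                     noisecounterA = 1; // NOTE(review): the state == 1 check below also runs on this same sample and increments this again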
+//                     noiseStart = i;
+//                 } else {
+//                     silencecounterA = 0;
+//                     noisecounterA = 0;
+//                     silencecounterB = 0;
+//                 }
+//             }
+//         }
+//         if (state == 1) { // measuring noise span
+//             noisecounterA++;
+//             if(sample>noiseThreshold)
+//             {
+//                 state = 0;
+//                 silencecounterA = 0;
+//                 noisecounterA = 0;
+//                 silencecounterB = 0;
+//             }
+//             else if(noisecounterA>noiseSpan)
+//             {
+//                 state = 2;
+//             }
+//         }
+//         if (state == 2) { // finding second silence
+//             if (sample < silenceThreshold) {
+//                 silencecounterB++;
+//                 if (silencecounterB >= minSilence2) {
+//                     // full click detected
+//                     size_t noiseend = noiseStart + noisecounterA - 1;
+//                     //printf("Click detected from %zu to %zu\n", noiseStart, noiseend);
+//                     for(size_t j=noiseStart;j<noiseend;++j)
+//                     {
+//                         data[j] *= 0.01f; //greatly suppress noise
+//                     }
+//                     // reset to search again
+//                     state = 0;
+//                     silencecounterA = 0;
+//                     noisecounterA = 0;
+//                     silencecounterB = 0;
+//                 }
+//             } else {
+//                 state = 0;
+//                 silencecounterA = 0;
+//                 noisecounterA = 0;
+//                 silencecounterB = 0;
+//             }
+//         }
+//     }
+// }
+
 static std::string save_wav16_base64(const std::vector<float> &data, int sample_rate) {
     std::ostringstream oss;
     wav_header header;
@@ -740,6 +819,7 @@ static tts_generation_outputs ttstype_generate_ttscpp(const tts_generation_input
         ttstime = timer_check();
         printf("\nTTS Generated audio in %.2fs.\n",ttstime);
         std::vector<float> wavdat = std::vector(response_data.data, response_data.data + response_data.n_outputs);
+        //audio_post_clean(wavdat);
         last_generated_audio = save_wav16_base64(wavdat, ttscpp_runner->sampling_rate);
         output.data = last_generated_audio.c_str();
         output.status = 1;
diff --git a/otherarch/ttscpp/include/phonemizer.h b/otherarch/ttscpp/include/phonemizer.h
@@ -128,7 +128,7 @@ static const std::map<const char, std::string> LETTER_PHONEMES = {
 	{'d', "dˈiː"},
 	{'e', "ˈiː"},
 	{'f', "ˈɛf"},
-	{'j', "dʒˈeɪ"},
+	{'g', "dʒˈi"},
 	{'h', "ˈeɪtʃ"},
 	{'i', "ˈaɪ"},
 	{'j', "dʒˈeɪ"},
diff --git a/otherarch/ttscpp/src/kokoro_model.cpp b/otherarch/ttscpp/src/kokoro_model.cpp
@@ -1426,11 +1426,13 @@ int kokoro_runner::generate(std::string prompt, struct tts_response * response,
     prompt = replace_any(prompt, ";:", "--");
     prompt = replace_any(prompt, "\n", "--");
 	kokoro_str_replace_all(prompt,"’","'");
+	kokoro_str_replace_all(prompt,"Mr. ","Mister ");
 	prompt = std::regex_replace(prompt, std::regex("(\\w)([.!?]) "), "$1$2, ");
 	kokoro_str_replace_all(prompt," - "," -- ");
 	kokoro_str_replace_all(prompt,"he's ","he is ");
 	kokoro_str_replace_all(prompt,"'s ","s ");
 	kokoro_str_replace_all(prompt,"n't ","nt ");
+	kokoro_str_replace_all(prompt,"*"," ");
   	std::string phonemized_prompt = phmzr->text_to_phonemes(prompt);
 	// printf("\nRESULT: %s\n",phonemized_prompt.c_str());
 
diff --git a/otherarch/ttscpp/src/phonemizer.cpp b/otherarch/ttscpp/src/phonemizer.cpp
@@ -893,9 +893,11 @@ bool phonemizer::process_word(corpus* text, std::string* output, std::string wor
 			text->size_pop(word.size()+unaccented_size_difference);
 			return true;
 		}
-	} else if (can_be_roman_numeral(word) && is_all_upper(word) && small_english_words.find(to_lower(word)) == small_english_words.end() && handle_roman_numeral(text, output, flags)) {
-		return true;
-	} else if (is_acronym_like(text, word, flags)) {
+	}
+	// else if (can_be_roman_numeral(word) && is_all_upper(word) && small_english_words.find(to_lower(word)) == small_english_words.end() && handle_roman_numeral(text, output, flags)) {
+	// 	return true;
+	// }
+	else if (is_acronym_like(text, word, flags)) {
 		return handle_acronym(text, word, output, flags);
 	} else if (word.find(".") < word.length()) {
 		bool part_has_accent = false;