Feat/voice selection (#6)

JarbasAl · web-flow · commit e04d7b46f6e9 · 2022-02-28T22:11:49.000Z
* add nancy voice

* add ljspeech voice

* add voice support

authored-by: jarbasai <jarbasai@mailfence.com>
diff --git a/ovos_tts_plugin_mimic2/__init__.py b/ovos_tts_plugin_mimic2/__init__.py
@@ -12,6 +12,7 @@
 #
 import base64
 import math
+import random
 import re
 
 import requests
@@ -21,7 +22,7 @@
 
 
 class Mimic2TTSPlugin(TTS):
-    """Interface to Catotron TTS."""
+    """Interface to Mimic2 TTS."""
     # Heuristic value, caps character length of a chunk of text
     # to be spoken as a work around for current Tacotron implementation limits.
     max_sentence_size = 170
@@ -30,7 +31,25 @@ def __init__(self, lang="en-us", config=None):
         config = config or {}
         super(Mimic2TTSPlugin, self).__init__(lang, config,
                                               Mimic2TTSValidator(self), 'wav')
-        self.url = config.get("url", "https://mimic-api.mycroft.ai/synthesize")
+        self.voice = self.voice.lower()
+        self._visemes = False
+        self.cache.persist = True  # save synths to avoid repeat queries
+        if self.config.get("url"):  # self hosted
+            self.url = self.config["url"]
+            # TODO disable cache to avoid filename conflicts with other voices
+            if not self.voice or self.voice == "default":
+                self.voice = f"selfhosted{random.randint(0, 9999999)}"
+                self.cache.persist = False
+        elif self.voice == "kusal" or self.voice == "default":
+            self.url = "https://mimic-api.mycroft.ai/synthesize"
+            self._visemes = True
+        elif self.voice == "nancy":
+            self.url = "https://nancy.2022.us/synthesize"
+        elif self.voice == "ljspeech":
+            self.url = "https://ljspeech.2022.us/synthesize"
+        else:
+            self.voice = "kusal"
+            self.url = "https://mimic-api.mycroft.ai/synthesize"
 
     def get_tts(self, sentence, wav_file, lang=None):
         """Fetch tts audio using tacotron endpoint.
@@ -41,13 +60,17 @@ def get_tts(self, sentence, wav_file, lang=None):
         Returns:
             Tuple ((str) written file, None)
         """
-        params = {"text": sentence, "visimes": True}
+        params = {"text": sentence, "visimes": self._visemes}
         r = requests.get(self.url, params=params)
         if not r.ok:
             raise RemoteTTSException(f"Mimic2 server error: {r.reason}")
-        results = r.json()
-        audio_data = base64.b64decode(results['audio_base64'])
-        phonemes = results['visimes']
+        if not self._visemes:
+            audio_data = r.content
+            phonemes = None
+        else:
+            results = r.json()
+            audio_data = base64.b64decode(results['audio_base64'])
+            phonemes = results['visimes']
         with open(wav_file, "wb") as f:
             f.write(audio_data)
         return (wav_file, phonemes)  # No phonemes
diff --git a/readme.md b/readme.md
@@ -12,15 +12,20 @@ OVOS TTS plugin for [Mimic2](https://github.com/MycroftAI/mimic2)
   "tts": {
     "module": "ovos-tts-plugin-mimic2",
     "ovos-tts-plugin-mimic2": {
-        "url": "https://mimic-api.mycroft.ai/synthesize"
+        "voice": "kusal"
     }
   }
  
 ```
 
 ### Voices
 
-You can self host models trained on [NancyCorpus](http://www.cstr.ed.ac.uk/projects/blizzard/2011/lessac_blizzard2011/) by [@MXGray](https://github.com/MXGray) and [LJ-Speech-Dataset](https://keithito.com/LJ-Speech-Dataset) by [keithito](https://github.com/keithito/tacotron)
+Available Voices:
+- Kusal - Mycroft AI official voice, hosted by Mycroft
+- Nancy - trained on [Nancy Corpus](http://www.cstr.ed.ac.uk/projects/blizzard/2011/lessac_blizzard2011/) by [@MXGray](https://github.com/MXGray), hosted by Neon
+- ljspeech - trained on [LJ-Speech-Dataset](https://keithito.com/LJ-Speech-Dataset) by [keithito](https://github.com/keithito/tacotron), hosted by Neon
+
+### Self Hosting
 
 The Kusal voice model is not provided by MycroftAI and can not be self hosted
 
@@ -36,6 +41,19 @@ docker build -f nancy.Dockerfile -t mimic2-nancy
 docker build -f ljspeech.Dockerfile -t mimic2-ljspeech
 ```
 
-run the container and set url in config `http://0.0.0.0:9000/synthesize`
+run the container 
 
 `docker run --rm -p 9000:9000 mimic2-nancy`
+
+set url and voice in config; the voice is used by ovos plugins for local caching of files
+
+```json
+  "tts": {
+    "module": "ovos-tts-plugin-mimic2",
+    "ovos-tts-plugin-mimic2": {
+        "url": "http://0.0.0.0:9000/synthesize",
+        "voice": "nancy"
+    }
+  }
+ 
+```
diff --git a/test/unittests/test_something.py b/test/unittests/test_something.py
@@ -5,13 +5,10 @@
 
 
 class TestTTS(unittest.TestCase):
-    @classmethod
-    def setUpClass(self):
-        self.mimic = Mimic2TTSPlugin()
-
-    def test_something(self):
+    def test_kusal(self):
         path = "/tmp/hello_kusal.wav"
-        audio, phonemes = self.mimic.get_tts("hello world", path)
+        mimic = Mimic2TTSPlugin()
+        audio, phonemes = mimic.get_tts("hello world", path)
         self.assertEqual(audio, path)
         self.assertEqual(phonemes,
                          [['HH', '0.0775'],
@@ -22,3 +19,17 @@ def test_something(self):
                           ['ER', '0.5580'],
                           ['L', '0.6820'],
                           ['D', '0.8060']])
+
+    def test_nancy(self):
+        path = "/tmp/hello_nancy.wav"
+        mimic = Mimic2TTSPlugin(config={"voice": "nancy"})
+        audio, phonemes = mimic.get_tts("hello world", path)
+        self.assertEqual(audio, path)
+        self.assertEqual(phonemes, None)
+
+    def test_ljspeech(self):
+        path = "/tmp/hello_ljspeech.wav"
+        mimic = Mimic2TTSPlugin(config={"voice": "ljspeech"})
+        audio, phonemes = mimic.get_tts("hello world", path)
+        self.assertEqual(audio, path)
+        self.assertEqual(phonemes, None)