Support custom dictionary param for TTS client (#82)

manishaj-nv · rmittal-github · web-flow · commit b32b08fac9b5 · 2024-08-07T12:30:05.000+05:30
* tts: add user_dictionary

* update py clients

* rename variable

* correct argument name

* update description and fix space while joining list

* update common repo SHA

* remove unused imports and fixes

* update common SHA

---------

Co-authored-by: rmittal-github <61574997+rmittal-github@users.noreply.github.com>
Co-authored-by: Rahul Mittal <rmittal@nvidia.com>
diff --git a/common b/common
@@ -1 +1 @@
-Subproject commit 42bf472434054d4e30f06a2452b396c2a4486201
+Subproject commit 988f86f84bf28d028f146ee5669b998ce3442be2
diff --git a/riva/client/tts.py b/riva/client/tts.py
@@ -11,6 +11,11 @@
 from riva.client.proto.riva_audio_pb2 import AudioEncoding
 import wave
 
+def add_custom_dictionary_to_config(req, custom_dictionary):
+    """Set ``req.custom_dictionary`` to comma-joined ``"key  value"`` pairs; ``None`` is treated as an empty dict."""
+    entries = (custom_dictionary or {}).items()  # callers default to None, and None.items() would raise
+    req.custom_dictionary = ','.join(f"{key}  {value}" for key, value in entries)
+
 class SpeechSynthesisService:
     """
     A class for synthesizing speech from text. Provides :meth:`synthesize` which returns entire audio for a text
@@ -38,6 +43,7 @@ def synthesize(
         audio_prompt_encoding: AudioEncoding = AudioEncoding.LINEAR_PCM,
         quality: int = 20,
         future: bool = False,
+        custom_dictionary: Optional[dict] = None,
     ) -> Union[rtts.SynthesizeSpeechResponse, _MultiThreadedRendezvous]:
         """
         Synthesizes an entire audio for text :param:`text`.
@@ -56,6 +62,7 @@ def synthesize(
                                    audio but also takes longer to generate the audio. Ranges between 1-40.
             future (:obj:`bool`, defaults to :obj:`False`): Whether to return an async result instead of usual
                 response. You can get a response by calling ``result()`` method of the future object.
+            custom_dictionary (:obj:`dict`, `optional`): Dictionary with key-value pair containing grapheme and corresponding phoneme
 
         Returns:
             :obj:`Union[riva.client.proto.riva_tts_pb2.SynthesizeSpeechResponse, grpc._channel._MultiThreadedRendezvous]`:
@@ -81,6 +88,8 @@ def synthesize(
             req.zero_shot_data.encoding = audio_prompt_encoding
             req.zero_shot_data.quality = quality
 
+        add_custom_dictionary_to_config(req, custom_dictionary)
+
         func = self.stub.Synthesize.future if future else self.stub.Synthesize
         return func(req, metadata=self.auth.get_auth_metadata())
 
@@ -94,6 +103,7 @@ def synthesize_online(
         audio_prompt_file: Optional[str] = None,
         audio_prompt_encoding: AudioEncoding = AudioEncoding.LINEAR_PCM,
         quality: int = 20,
+        custom_dictionary: Optional[dict] = None,
     ) -> Generator[rtts.SynthesizeSpeechResponse, None, None]:
         """
         Synthesizes and yields output audio chunks for text :param:`text` as the chunks
@@ -111,6 +121,7 @@ def synthesize_online(
             audio_prompt_encoding: (:obj:`AudioEncoding`): Encoding of audio prompt file, e.g. ``AudioEncoding.LINEAR_PCM``.
             quality: (:obj:`int`): This defines the number of times decoder is run. Higher number improves quality of generated
                                    audio but also takes longer to generate the audio. Ranges between 1-40.
+            custom_dictionary (:obj:`dict`, `optional`): Dictionary with key-value pair containing grapheme and corresponding phoneme
 
         Yields:
             :obj:`riva.client.proto.riva_tts_pb2.SynthesizeSpeechResponse`: a response with output. You may find
@@ -138,4 +149,6 @@ def synthesize_online(
             req.zero_shot_data.encoding = audio_prompt_encoding
             req.zero_shot_data.quality = quality
 
+        add_custom_dictionary_to_config(req, custom_dictionary)                   
+
         return self.stub.SynthesizeOnline(req, metadata=self.auth.get_auth_metadata())
diff --git a/scripts/tts/talk.py b/scripts/tts/talk.py
@@ -10,6 +10,20 @@
 import riva.client
 from riva.client.argparse_utils import add_connection_argparse_parameters
 
+def read_file_to_dict(file_path):
+    """Parse ``grapheme<2 spaces>phoneme`` lines into a dict; raise ValueError if no line is valid."""
+    result_dict = {}
+    # Explicit encoding so non-ASCII symbols decode the same way on every platform.
+    with open(file_path, 'r', encoding='utf-8') as file:
+        for line_number, line in enumerate(map(str.strip, file), start=1):
+            key, separator, value = line.partition('  ')  # separator == '' when absent
+            if separator:
+                result_dict[key.strip()] = value.strip()
+            elif line:  # blank lines are skipped silently; other bad lines are reported
+                print(f"Warning: Malformed line {line_number}: {line}")
+    if not result_dict:
+        raise ValueError("Error: No valid entries found in the file.")
+    return result_dict
 
 def parse_args() -> argparse.Namespace:
     parser = argparse.ArgumentParser(
@@ -42,6 +56,7 @@ def parse_args() -> argparse.Namespace:
     parser.add_argument(
         "--sample-rate-hz", type=int, default=44100, help="Number of audio frames per second in synthesized audio."
     )
+    parser.add_argument("--custom-dictionary", type=str, help="A file path to a user dictionary with key-value pairs separated by double spaces.")
     parser.add_argument(
         "--stream",
         action="store_true",
@@ -108,12 +123,17 @@ def main() -> None:
             out_f.setsampwidth(sampwidth)
             out_f.setframerate(args.sample_rate_hz)
 
+        custom_dictionary_input = {}
+        if args.custom_dictionary is not None:
+            custom_dictionary_input = read_file_to_dict(args.custom_dictionary)
+
         print("Generating audio for request...")
         start = time.time()
         if args.stream:
             responses = service.synthesize_online(
                 args.text, args.voice, args.language_code, sample_rate_hz=args.sample_rate_hz,
-                audio_prompt_file=args.audio_prompt_file, quality=20 if args.quality is None else args.quality
+                audio_prompt_file=args.audio_prompt_file, quality=20 if args.quality is None else args.quality,
+                custom_dictionary=custom_dictionary_input
             )
             first = True
             for resp in responses:
@@ -128,7 +148,8 @@ def main() -> None:
         else:
             resp = service.synthesize(
                 args.text, args.voice, args.language_code, sample_rate_hz=args.sample_rate_hz,
-                audio_prompt_file=args.audio_prompt_file, quality=20 if args.quality is None else args.quality
+                audio_prompt_file=args.audio_prompt_file, quality=20 if args.quality is None else args.quality,
+                custom_dictionary=custom_dictionary_input
             )
             stop = time.time()
             print(f"Time spent: {(stop - start):.3f}s")