9
9
import riva .client .proto .riva_tts_pb2_grpc as rtts_srv
10
10
from riva .client import Auth
11
11
from riva .client .proto .riva_audio_pb2 import AudioEncoding
12
-
12
+ import wave
13
13
14
14
class SpeechSynthesisService :
15
15
"""
@@ -34,20 +34,27 @@ def synthesize(
34
34
language_code : str = 'en-US' ,
35
35
encoding : AudioEncoding = AudioEncoding .LINEAR_PCM ,
36
36
sample_rate_hz : int = 44100 ,
37
+ audio_prompt_file : Optional [str ] = None ,
38
+ audio_prompt_encoding : AudioEncoding = AudioEncoding .LINEAR_PCM ,
39
+ quality : int = 20 ,
37
40
future : bool = False ,
38
41
) -> Union [rtts .SynthesizeSpeechResponse , _MultiThreadedRendezvous ]:
39
42
"""
40
43
Synthesizes an entire audio for text :param:`text`.
41
44
42
45
Args:
43
- text (:obj:`str`): an input text.
44
- voice_name (:obj:`str`, `optional`): a name of the voice, e.g. ``"English-US-Female-1"``. You may find
46
+ text (:obj:`str`): An input text.
47
+ voice_name (:obj:`str`, `optional`): A name of the voice, e.g. ``"English-US-Female-1"``. You may find
45
48
available voices in server logs or in server model directory. If this parameter is :obj:`None`, then
46
49
a server will select the first available model with correct :param:`language_code` value.
47
50
language_code (:obj:`str`): A language to use.
48
- encoding (:obj:`AudioEncoding`): an output audio encoding, e.g. ``AudioEncoding.LINEAR_PCM``.
49
- sample_rate_hz (:obj:`int`): number of frames per second in output audio.
50
- future (:obj:`bool`, defaults to :obj:`False`): whether to return an async result instead of usual
51
+ encoding (:obj:`AudioEncoding`): An output audio encoding, e.g. ``AudioEncoding.LINEAR_PCM``.
52
+ sample_rate_hz (:obj:`int`): Number of frames per second in output audio.
53
+ audio_prompt_file (:obj:`str`): An audio prompt file location for zero shot model.
54
+ audio_prompt_encoding (:obj:`AudioEncoding`): Encoding of the audio prompt file, e.g. ``AudioEncoding.LINEAR_PCM``.
55
+ quality (:obj:`int`): The number of times the decoder is run. A higher number improves the quality of the generated
56
+ audio but also takes longer to generate the audio. Ranges between 1-40.
57
+ future (:obj:`bool`, defaults to :obj:`False`): Whether to return an async result instead of usual
51
58
response. You can get a response by calling ``result()`` method of the future object.
52
59
53
60
Returns:
@@ -64,6 +71,16 @@ def synthesize(
64
71
)
65
72
if voice_name is not None :
66
73
req .voice_name = voice_name
74
+ if audio_prompt_file is not None :
75
+ with wave .open (str (audio_prompt_file ), 'rb' ) as wf :
76
+ rate = wf .getframerate ()
77
+ req .zero_shot_data .sample_rate = rate
78
+ with audio_prompt_file .open ('rb' ) as wav_f :
79
+ audio_data = wav_f .read ()
80
+ req .zero_shot_data .audio_prompt = audio_data
81
+ req .zero_shot_data .encoding = audio_prompt_encoding
82
+ req .zero_shot_data .quality = quality
83
+
67
84
func = self .stub .Synthesize .future if future else self .stub .Synthesize
68
85
return func (req , metadata = self .auth .get_auth_metadata ())
69
86
@@ -74,19 +91,26 @@ def synthesize_online(
74
91
language_code : str = 'en-US' ,
75
92
encoding : AudioEncoding = AudioEncoding .LINEAR_PCM ,
76
93
sample_rate_hz : int = 44100 ,
94
+ audio_prompt_file : Optional [str ] = None ,
95
+ audio_prompt_encoding : AudioEncoding = AudioEncoding .LINEAR_PCM ,
96
+ quality : int = 20 ,
77
97
) -> Generator [rtts .SynthesizeSpeechResponse , None , None ]:
78
98
"""
79
99
Synthesizes and yields output audio chunks for text :param:`text` as the chunks
80
100
becoming available.
81
101
82
102
Args:
83
- text (:obj:`str`): an input text.
84
- voice_name (:obj:`str`, `optional`): a name of the voice, e.g. ``"English-US-Female-1"``. You may find
103
+ text (:obj:`str`): An input text.
104
+ voice_name (:obj:`str`, `optional`): A name of the voice, e.g. ``"English-US-Female-1"``. You may find
85
105
available voices in server logs or in server model directory. If this parameter is :obj:`None`, then
86
106
a server will select the first available model with correct :param:`language_code` value.
87
- language_code (:obj:`str`): a language to use.
88
- encoding (:obj:`AudioEncoding`): an output audio encoding, e.g. ``AudioEncoding.LINEAR_PCM``.
89
- sample_rate_hz (:obj:`int`): number of frames per second in output audio.
107
+ language_code (:obj:`str`): A language to use.
108
+ encoding (:obj:`AudioEncoding`): An output audio encoding, e.g. ``AudioEncoding.LINEAR_PCM``.
109
+ sample_rate_hz (:obj:`int`): Number of frames per second in output audio.
110
+ audio_prompt_file (:obj:`str`): An audio prompt file location for zero shot model.
111
+ audio_prompt_encoding (:obj:`AudioEncoding`): Encoding of the audio prompt file, e.g. ``AudioEncoding.LINEAR_PCM``.
112
+ quality (:obj:`int`): The number of times the decoder is run. A higher number improves the quality of the generated
113
+ audio but also takes longer to generate the audio. Ranges between 1-40.
90
114
91
115
Yields:
92
116
:obj:`riva.client.proto.riva_tts_pb2.SynthesizeSpeechResponse`: a response with output. You may find
@@ -103,4 +127,15 @@ def synthesize_online(
103
127
)
104
128
if voice_name is not None :
105
129
req .voice_name = voice_name
130
+
131
+ if audio_prompt_file is not None :
132
+ with wave .open (str (audio_prompt_file ), 'rb' ) as wf :
133
+ rate = wf .getframerate ()
134
+ req .zero_shot_data .sample_rate = rate
135
+ with audio_prompt_file .open ('rb' ) as wav_f :
136
+ audio_data = wav_f .read ()
137
+ req .zero_shot_data .audio_prompt = audio_data
138
+ req .zero_shot_data .encoding = audio_prompt_encoding
139
+ req .zero_shot_data .quality = quality
140
+
106
141
return self .stub .SynthesizeOnline (req , metadata = self .auth .get_auth_metadata ())
0 commit comments