Commit 330aa60

rmittal-github, virajkarandikar, and sarane22 authored
Merge release/2.16.0 to main (#84)
* add list voices support to tts client (#78)

  add --list-voices parameter to tts client to query supported voices

* Add ASR endpointing stop_threshold_eou parameter (#83)

  * Exposing the 'stop_historu_eou_th' parameter
  * updating submodule
  * Updating param name
  * Updating help for VAD param
  * Adding check for stop_threshold_eou
  * Updating proto branch
  * updating the submodule

---------

Co-authored-by: Viraj Karandikar <[email protected]>
Co-authored-by: sarane22 <[email protected]>
1 parent 18d6f8f commit 330aa60
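The two changes surface as new client flags: scripts/tts/talk.py gains --list-voices to print the voices the server exposes, and the ASR scripts gain --stop-threshold-eou to tune first-pass end-of-utterance detection. Illustrative invocations only (they assume a Riva server at the default localhost:50051, and the 0.85 threshold is an arbitrary example value, not a recommendation):

    python scripts/tts/talk.py --list-voices
    python scripts/asr/transcribe_mic.py --stop-threshold-eou 0.85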

File tree

8 files changed (+73, -39 lines)


riva/client/argparse_utils.py

Lines changed: 11 additions & 5 deletions
@@ -52,7 +52,7 @@ def add_asr_config_argparse_parameters(
         "--start-history",
         default=-1,
         type=int,
-        help="Value to detect and initiate start of speech utterance",
+        help="Value (in milliseconds) to detect and initiate start of speech utterance",
     )
     parser.add_argument(
         "--start-threshold",
@@ -64,19 +64,25 @@
         "--stop-history",
         default=-1,
         type=int,
-        help="Value to reset the endpoint detection history",
+        help="Value (in milliseconds) to detect end of utterance and reset decoder",
+    )
+    parser.add_argument(
+        "--stop-threshold",
+        default=-1.0,
+        type=float,
+        help="Threshold value for detecting the end of speech utterance",
     )
     parser.add_argument(
         "--stop-history-eou",
         default=-1,
         type=int,
-        help="Value to determine the response history for endpoint detection",
+        help="Value (in milliseconds) to detect end of utterance for the 1st pass and generate an intermediate final transcript",
     )
     parser.add_argument(
-        "--stop-threshold",
+        "--stop-threshold-eou",
         default=-1.0,
         type=float,
-        help="Threshold value for detecting the end of speech utterance",
+        help="Threshold value for likelihood of blanks before detecting end of utterance",
     )
     return parser
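To see what the renamed flag looks like to a caller, here is a bare-parser sketch that mirrors the add_argument call above (illustration only, not code from this commit; the 0.85 value is arbitrary):

import argparse

# Mirror the --stop-threshold-eou argument registered by add_asr_config_argparse_parameters.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--stop-threshold-eou",
    default=-1.0,
    type=float,
    help="Threshold value for likelihood of blanks before detecting end of utterance",
)

args = parser.parse_args(["--stop-threshold-eou", "0.85"])
print(args.stop_threshold_eou)  # 0.85 -- argparse maps the dashed flag to this attribute name

The updated scripts below rely on exactly this flag-to-attribute mapping when they pass args.stop_threshold_eou through to the config helper.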

riva/client/asr.py

Lines changed: 4 additions & 1 deletion
@@ -130,8 +130,9 @@ def add_endpoint_parameters_to_config(
     stop_history: int,
     stop_history_eou: int,
     stop_threshold: float,
+    stop_threshold_eou: float,
 ) -> None:
-    if not (start_history > 0 or start_threshold > 0 or stop_history > 0 or stop_history_eou > 0 or stop_threshold > 0):
+    if not (start_history > 0 or start_threshold > 0 or stop_history > 0 or stop_history_eou > 0 or stop_threshold > 0 or stop_threshold_eou > 0):
         return

     inner_config: rasr.RecognitionConfig = config if isinstance(config, rasr.RecognitionConfig) else config.config
@@ -146,6 +147,8 @@
         endpointing_config.stop_history_eou = stop_history_eou
     if stop_threshold > 0:
         endpointing_config.stop_threshold = stop_threshold
+    if stop_threshold_eou > 0:
+        endpointing_config.stop_threshold_eou = stop_threshold_eou
     inner_config.endpointing_config.CopyFrom(endpointing_config)
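For callers that build configs in code rather than through the scripts, a minimal sketch of driving the updated helper directly (illustration only, not part of this commit; values <= 0 are skipped by the guard above, 0.85 is an arbitrary example, and the stop_threshold_eou proto field assumes the riva proto version pulled in by this release's submodule update):

import riva.client

config = riva.client.RecognitionConfig(language_code="en-US")
riva.client.add_endpoint_parameters_to_config(
    config,
    start_history=-1,         # <= 0: left unset
    start_threshold=-1.0,
    stop_history=-1,
    stop_history_eou=-1,
    stop_threshold=-1.0,
    stop_threshold_eou=0.85,  # the newly exposed first-pass EOU threshold
)
print(config.endpointing_config.stop_threshold_eou)  # 0.85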

scripts/asr/riva_streaming_asr_client.py

Lines changed: 7 additions & 6 deletions
@@ -64,12 +64,13 @@ def streaming_transcription_worker(
         interim_results=True,
     )
     riva.client.add_endpoint_parameters_to_config(
-        config,
-        args.start_history,
-        args.start_threshold,
-        args.stop_history,
-        args.stop_history_eou,
-        args.stop_threshold
+        config,
+        args.start_history,
+        args.start_threshold,
+        args.stop_history,
+        args.stop_history_eou,
+        args.stop_threshold,
+        args.stop_threshold_eou
     )
     riva.client.add_word_boosting_to_config(config, args.boosted_lm_words, args.boosted_lm_score)
     for _ in range(args.num_iterations):

scripts/asr/transcribe_file.py

Lines changed: 7 additions & 6 deletions
@@ -80,12 +80,13 @@ def main() -> None:
     )
     riva.client.add_word_boosting_to_config(config, args.boosted_lm_words, args.boosted_lm_score)
     riva.client.add_endpoint_parameters_to_config(
-        config,
-        args.start_history,
-        args.start_threshold,
-        args.stop_history,
-        args.stop_history_eou,
-        args.stop_threshold
+        config,
+        args.start_history,
+        args.start_threshold,
+        args.stop_history,
+        args.stop_history_eou,
+        args.stop_threshold,
+        args.stop_threshold_eou
     )
     sound_callback = None
     try:

scripts/asr/transcribe_file_offline.py

Lines changed: 7 additions & 6 deletions
@@ -39,12 +39,13 @@ def main() -> None:
     riva.client.add_word_boosting_to_config(config, args.boosted_lm_words, args.boosted_lm_score)
     riva.client.add_speaker_diarization_to_config(config, args.speaker_diarization)
     riva.client.add_endpoint_parameters_to_config(
-        config,
-        args.start_history,
-        args.start_threshold,
-        args.stop_history,
-        args.stop_history_eou,
-        args.stop_threshold
+        config,
+        args.start_history,
+        args.start_threshold,
+        args.stop_history,
+        args.stop_history_eou,
+        args.stop_threshold,
+        args.stop_threshold_eou
     )
     with args.input_file.open('rb') as fh:
         data = fh.read()

scripts/asr/transcribe_mic.py

Lines changed: 7 additions & 6 deletions
@@ -58,12 +58,13 @@ def main() -> None:
     )
     riva.client.add_word_boosting_to_config(config, args.boosted_lm_words, args.boosted_lm_score)
     riva.client.add_endpoint_parameters_to_config(
-        config,
-        args.start_history,
-        args.start_threshold,
-        args.stop_history,
-        args.stop_history_eou,
-        args.stop_threshold
+        config,
+        args.start_history,
+        args.start_threshold,
+        args.stop_history,
+        args.stop_history_eou,
+        args.stop_threshold,
+        args.stop_threshold_eou
     )
     with riva.client.audio_io.MicrophoneStream(
         args.sample_rate_hz,

scripts/tts/talk.py

Lines changed: 29 additions & 8 deletions
@@ -4,6 +4,7 @@
 import argparse
 import time
 import wave
+import json
 from pathlib import Path

 import riva.client
@@ -21,12 +22,12 @@ def parse_args() -> argparse.Namespace:
         help="A voice name to use. If this parameter is missing, then the server will try a first available model "
         "based on parameter `--language-code`.",
     )
-    parser.add_argument("--text", type=str, required=True, help="Text input to synthesize.")
+    parser.add_argument("--text", type=str, required=False, help="Text input to synthesize.")
     parser.add_argument(
         "--audio_prompt_file",
         type=Path,
         help="An input audio prompt (.wav) file for zero shot model. This is required to do zero shot inferencing.")
-    parser.add_argument("-o", "--output", type=Path, help="Output file .wav file to write synthesized audio.")
+    parser.add_argument("-o", "--output", type=Path, default="output.wav", help="Output file .wav file to write synthesized audio.")
     parser.add_argument("--quality", type=int, help="Number of times decoder should be run on the output audio. A higher number improves quality of the produced output but introduces latencies.")
     parser.add_argument(
         "--play-audio",
@@ -35,6 +36,7 @@ def parse_args() -> argparse.Namespace:
         "then the default output audio device will be used.",
     )
     parser.add_argument("--list-devices", action="store_true", help="List output audio devices indices.")
+    parser.add_argument("--list-voices", action="store_true", help="List available voices.")
     parser.add_argument("--output-device", type=int, help="Output device to use.")
     parser.add_argument("--language-code", default='en-US', help="A language of input text.")
     parser.add_argument(
@@ -49,11 +51,6 @@ def parse_args() -> argparse.Namespace:
     )
     parser = add_connection_argparse_parameters(parser)
     args = parser.parse_args()
-    if args.output is None and not args.play_audio and args.output_device is None and not args.list_devices:
-        parser.error(
-            f"You have to provide at least one of arguments: `--play-audio`, `--output-device`, `--output`, "
-            f"`--list-devices`."
-        )
     if args.output is not None:
         args.output = args.output.expanduser()
     if args.list_devices or args.output_device or args.play_audio:
@@ -65,12 +62,36 @@ def main() -> None:
     args = parse_args()
     if args.list_devices:
         riva.client.audio_io.list_output_devices()
-        return
+
     auth = riva.client.Auth(args.ssl_cert, args.use_ssl, args.server, args.metadata)
     service = riva.client.SpeechSynthesisService(auth)
     nchannels = 1
     sampwidth = 2
     sound_stream, out_f = None, None
+
+    if args.list_voices:
+        config_response = service.stub.GetRivaSynthesisConfig(
+            riva.client.proto.riva_tts_pb2.RivaSynthesisConfigRequest()
+        )
+        tts_models = dict()
+        for model_config in config_response.model_config:
+            language_code = model_config.parameters['language_code']
+            voice_name = model_config.parameters['voice_name']
+            subvoices = [voice.split(':')[0] for voice in model_config.parameters['subvoices'].split(',')]
+            full_voice_names = [voice_name + "." + subvoice for subvoice in subvoices]
+
+            if language_code in tts_models:
+                tts_models[language_code]['voices'].extend(full_voice_names)
+            else:
+                tts_models[language_code] = {"voices": full_voice_names}
+
+        tts_models = dict(sorted(tts_models.items()))
+        print(json.dumps(tts_models, indent=4))
+
+    if not args.text:
+        print("No input text provided")
+        return
+
     try:
         if args.output_device is not None or args.play_audio:
             sound_stream = riva.client.audio_io.SoundCallBack(
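The new --list-voices path boils down to a single RPC. A minimal sketch of issuing the same GetRivaSynthesisConfig call outside talk.py (illustration only; the Auth arguments mirror the positional order talk.py uses, and the server address is an assumed default):

import riva.client
import riva.client.proto.riva_tts_pb2 as riva_tts

# ssl_cert, use_ssl, server -- same positional order as in talk.py's main(); metadata left at its default.
auth = riva.client.Auth(None, False, "localhost:50051")
service = riva.client.SpeechSynthesisService(auth)

response = service.stub.GetRivaSynthesisConfig(riva_tts.RivaSynthesisConfigRequest())
for model_config in response.model_config:
    params = model_config.parameters
    print(params['language_code'], params['voice_name'], params['subvoices'])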
