Skip to content

Commit 55755ae

Browse files
committed
cosm (ruff)
1 parent 489ba3e commit 55755ae

File tree

1 file changed

+49
-30
lines changed

1 file changed

+49
-30
lines changed

ssak/utils/tts.py

100755 → 100644
Lines changed: 49 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,15 @@
1-
import parler_tts
1+
import random
22

3+
import nltk
4+
import parler_tts
35
import torch
46
import torchaudio
5-
6-
import random
77
import transformers
8-
9-
import nltk
108
from nltk.tokenize import sent_tokenize
119

1210
# Download NLTK's punkt tokenizer data if not already downloaded
1311
global _nltk_initialized
14-
try:
15-
_nltk_initialized
16-
except NameError:
17-
_nltk_initialized = False
12+
_nltk_initialized = False
1813

1914
# Function to split text into chunks using sentence tokenization
2015
def nltk_chunk_text(text):
@@ -25,15 +20,32 @@ def nltk_chunk_text(text):
2520
return sent_tokenize(text)
2621

2722
_tts_speaker_prompts = [
28-
"A female speaker delivers an expressive and animated speech with a very high-pitch voice. The recording is slightly noisy but of good quality, as her voice comes across as very close-sounding.",
29-
"A female speaker delivers her speech with a slightly expressive and animated tone, her voice ringing clearly and undistorted in the recording. The pitch of her voice is very high, adding a sense of urgency and excitement.",
30-
"A female speaks with a slightly expressive and animated tone in a recording that sounds quite clear and close up. There is only a mild amount of background noise present, and her voice has a moderate pitch. Her speech pace is steady, neither slow nor particularly fast.",
31-
"A female speaker delivers her speech in a recording that sounds clear and close up. Her voice is slightly expressive and animated, with a moderate pitch. The recording has a mild amount of background noise, but her voice is still easily understood.",
32-
"In a somewhat confined space, a female speaker delivers a talk that is slightly expressive and animated, despite some background noise. Her voice has a low-pitch tone.",
33-
"A male voice speaks in a monotone tone with a slightly low-pitch, delivering his words at a moderate speed. The recording offers almost no noise, resulting in a very clear and high-quality listen. The close-up microphone captures every detail of his speech.",
34-
"A man speaks with a monotone tone and a slightly low-pitch, delivering his words at a moderate speed. The recording captures his speech very clearly and distinctly, with little to no background noise. The listener feels as if they're almost sharing the same space with the speaker.",
35-
"A male speaker delivers his words with a very monotone and slightly faster than average pace. His voice is very clear, making every word distinct, while it also has a slightly low-pitch tone. The recording quality is excellent, with no apparent reverberation or background noise.",
36-
"A male speaker delivers his words in a very monotone and slightly low-pitched voice, maintaining a moderate speed. The recording is of very high quality, with minimum noise and a very close-sounding reverberation that suggests a quiet and enclosed environment.",
23+
"A female speaker delivers an expressive and animated speech with a very high-pitch voice. "
24+
"The recording is slightly noisy but of good quality, as her voice comes across as very close-sounding.",
25+
"A female speaker delivers her speech with a slightly expressive and animated tone, "
26+
"her voice ringing clearly and undistorted in the recording. "
27+
"The pitch of her voice is very high, adding a sense of urgency and excitement.",
28+
"A female speaks with a slightly expressive and animated tone in a recording that sounds quite clear and close up. "
29+
"There is only a mild amount of background noise present, and her voice has a moderate pitch. "
30+
"Her speech pace is steady, neither slow nor particularly fast.",
31+
"A female speaker delivers her speech in a recording that sounds clear and close up. "
32+
"Her voice is slightly expressive and animated, with a moderate pitch. "
33+
"The recording has a mild amount of background noise, but her voice is still easily understood.",
34+
"In a somewhat confined space, a female speaker delivers a talk that is slightly expressive and animated, "
35+
"despite some background noise. "
36+
"Her voice has a low-pitch tone.",
37+
"A male voice speaks in a monotone tone with a slightly low-pitch, delivering his words at a moderate speed. "
38+
"The recording offers almost no noise, resulting in a very clear and high-quality listen. "
39+
"The close-up microphone captures every detail of his speech.",
40+
"A man speaks with a monotone tone and a slightly low-pitch, delivering his words at a moderate speed. "
41+
"The recording captures his speech very clearly and distinctly, with little to no background noise. "
42+
"The listener feels as if they're almost sharing the same space with the speaker.",
43+
"A male speaker delivers his words with a very monotone and slightly faster than average pace. "
44+
"His voice is very clear, making every word distinct, while it also has a slightly low-pitch tone. "
45+
"The recording quality is excellent, with no apparent reverberation or background noise.",
46+
"A male speaker delivers his words in a very monotone and slightly low-pitched voice, "
47+
"maintaining a moderate speed. The recording is of very high quality, with minimum noise "
48+
"and a very close-sounding reverberation that suggests a quiet and enclosed environment.",
3749
]
3850

3951

@@ -46,10 +58,10 @@ def text_to_speech(
4658
prompt=None,
4759
device=None,
4860
model_name="parler-tts/parler-tts-mini-multilingual-v1.1",
49-
sample_rate=16_000,
61+
sampling_rate=16_000,
5062
):
5163
global _tts_models
52-
64+
5365
# Set up device
5466
if device is None:
5567
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -58,18 +70,24 @@ def text_to_speech(
5870

5971
if prompt is None:
6072
prompt = random.choice(_tts_speaker_prompts)
73+
elif isinstance(prompt, list):
74+
prompt = random.choice(prompt)
75+
elif isinstance(prompt, str):
76+
pass
77+
else:
78+
raise ValueError("Prompt must be a string or a list of strings")
6179

6280
# Load processor and model from Hugging Face, with caching in (V)RAM
6381
if model_name not in _tts_models:
6482

6583
model = parler_tts.ParlerTTSForConditionalGeneration.from_pretrained(model_name).to(device)
6684
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
6785
description_tokenizer = transformers.AutoTokenizer.from_pretrained(model.config.text_encoder._name_or_path)
68-
model_sample_rate = model.config.sampling_rate
86+
model_sampling_rate = model.config.sampling_rate
87+
88+
_tts_models[model_name] = (model, tokenizer, description_tokenizer, model_sampling_rate)
6989

70-
_tts_models[model_name] = (model, tokenizer, description_tokenizer, model_sample_rate)
71-
72-
(model, tokenizer, description_tokenizer, model_sample_rate) = _tts_models[model_name]
90+
(model, tokenizer, description_tokenizer, model_sampling_rate) = _tts_models[model_name]
7391
model = model.to(device)
7492

7593
text_tokens = tokenizer(text, return_tensors="pt").input_ids.to(device)
@@ -81,23 +99,24 @@ def text_to_speech(
8199

82100
audio_tensor = audio_tensor.to("cpu")
83101

84-
if sample_rate != model_sample_rate:
85-
audio_tensor = torchaudio.transforms.Resample(model_sample_rate, sample_rate)(audio_tensor)
102+
if sampling_rate != model_sampling_rate:
103+
audio_tensor = torchaudio.transforms.Resample(model_sampling_rate, sampling_rate)(audio_tensor)
86104

87105
audio_tensor = audio_tensor.numpy()
88106

89107
return audio_tensor
90108

91109
if __name__ == "__main__":
92110

93-
from audio import save_audio
94-
import os
95-
96111
import argparse
112+
import os
113+
114+
from audio import save_audio
97115
parser = argparse.ArgumentParser()
98116
parser.add_argument("words", type=str, nargs="+", help="Text to convert to speech")
99117
parser.add_argument("--device", type=str, default=None, help="Device to use for inference")
100-
parser.add_argument("--model_name", type=str, default="parler-tts/parler-tts-mini-multilingual-v1.1", help="Model name or path")
118+
parser.add_argument("--model_name", type=str, default="parler-tts/parler-tts-mini-multilingual-v1.1",
119+
help="Model name or path")
101120
parser.add_argument("--output", type=str, default="out", help="Output folder name")
102121
parser.add_argument("--num", type=int, default=10, help="Number of generations")
103122
args = parser.parse_args()

0 commit comments

Comments
 (0)