Skip to content

Commit 701f733

Browse files
Aleksei Korobeinikov, VoronovaIntern, eaidova, anzhella-pankratova
authored
OV 2.0: python text_to_speech demo (#3038)
* first version * update for melgan * some fixes * update rnn_width value * update name for create infer request function * add tests and parsing string input * remove mistakes * remove mistakes2 * update readme.md * update cases.py * remove mistakes3 * update readme * update requirements.txt * remove mistakes in requirements.txt * update requirements-demos.txt * Update ci/requirements-demos.txt Co-authored-by: Ekaterina Aidova <[email protected]> * update reqs for demo * amend * change input and location tkinter import * Update demos/text_to_speech_demo/python/models/forward_tacotron_ie.py * Update demos/text_to_speech_demo/python/models/forward_tacotron_ie.py Co-authored-by: VoronovaIntern <[email protected]> Co-authored-by: Ekaterina Aidova <[email protected]> Co-authored-by: Anzhella Pankratova <[email protected]>
1 parent 6044340 commit 701f733

File tree

8 files changed

+165
-110
lines changed

8 files changed

+165
-110
lines changed

ci/requirements-demos.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,7 @@ xmltodict==0.12.0
154154
# via motmetrics
155155
zipp==3.5.0
156156
# via importlib-metadata
157+
inflect==5.3.0
157158

158159
# The following packages are considered to be unsafe in a requirements file:
159160
# setuptools

demos/requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,3 +10,4 @@ sympy>=1.8
1010
tokenizers~=0.10.1
1111
tensorboardX>=2.1
1212
tqdm>=4.54.1
13+
inflect>=5.3.0

demos/tests/cases.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1307,6 +1307,32 @@ def single_option_cases(key, *args):
13071307
]
13081308
)),
13091309

1310+
PythonDemo(name='text_to_speech_demo', device_keys=['-d'],
1311+
model_keys=['-m_duration', '-m_forward', '-m_upsample', '-m_rnn', '-m_melgan'], test_cases=combine_cases(
1312+
TestCase(options={'-i': [
1313+
'The quick brown fox jumps over the lazy dog.',
1314+
'The five boxing wizards jump quickly.'
1315+
]}),
1316+
[
1317+
TestCase(options={
1318+
'-m_duration': ModelArg('forward-tacotron-duration-prediction'),
1319+
'-m_forward': ModelArg('forward-tacotron-regression'),
1320+
'-m_upsample': ModelArg('wavernn-upsampler'),
1321+
'-m_rnn': ModelArg('wavernn-rnn')
1322+
}),
1323+
TestCase(options={
1324+
'-m_duration': ModelArg('text-to-speech-en-0001-duration-prediction'),
1325+
'-m_forward': ModelArg('text-to-speech-en-0001-regression'),
1326+
'-m_melgan': ModelArg('text-to-speech-en-0001-generation')
1327+
}),
1328+
TestCase(options={
1329+
'-m_duration': ModelArg('text-to-speech-en-multi-0001-duration-prediction'),
1330+
'-m_forward': ModelArg('text-to-speech-en-multi-0001-regression'),
1331+
'-m_melgan': ModelArg('text-to-speech-en-multi-0001-generation')
1332+
}),
1333+
]
1334+
)),
1335+
13101336
PythonDemo(name='whiteboard_inpainting_demo', device_keys=['-d'],
13111337
model_keys=['-m_i', '-m_s'], test_cases=combine_cases(
13121338
TestCase(options={'-i': TestDataArg('msasl/global_crops/_nz_sivss20/clip_0017/img_%05d.jpg'),

demos/text_to_speech_demo/python/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ Options:
6363
Required. Path to ForwardTacotron`s mel-spectrogram
6464
regression part (*.xml format).
6565
-i INPUT, --input INPUT
66-
Required. Text file with text.
66+
Required. Text or path to the input file.
6767
-o OUT, --out OUT Optional. Path to an output .wav file
6868
-d DEVICE, --device DEVICE
6969
Optional. Specify the target device to infer on; CPU,

demos/text_to_speech_demo/python/models/forward_tacotron_ie.py

Lines changed: 34 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
"""
2-
Copyright (c) 2020 Intel Corporation
2+
Copyright (c) 2020-2022 Intel Corporation
33
44
Licensed under the Apache License, Version 2.0 (the "License");
55
you may not use this file except in compliance with the License.
@@ -23,31 +23,39 @@
2323
from utils.embeddings_processing import PCA
2424

2525

26+
def check_input_name(model, input_tensor_name):
27+
try:
28+
model.input(input_tensor_name)
29+
return True
30+
except RuntimeError:
31+
return False
32+
33+
2634
class ForwardTacotronIE:
2735
def __init__(self, model_duration, model_forward, ie, device='CPU', verbose=False):
2836
self.verbose = verbose
2937
self.device = device
3038
self.ie = ie
3139

32-
self.duration_predictor_net = self.load_network(model_duration)
33-
self.duration_predictor_exec = self.create_exec_network(self.duration_predictor_net, model_duration)
40+
self.duration_predictor_model = self.load_network(model_duration)
41+
self.duration_predictor_request = self.create_infer_request(self.duration_predictor_model, model_duration)
3442

35-
self.forward_net = self.load_network(model_forward)
36-
self.forward_exec = self.create_exec_network(self.forward_net, model_forward)
43+
self.forward_model = self.load_network(model_forward)
44+
self.forward_request = self.create_infer_request(self.forward_model, model_forward)
3745

3846
# fixed length of the sequence of symbols
39-
self.duration_len = self.duration_predictor_net.input_info['input_seq'].input_data.shape[1]
47+
self.duration_len = self.duration_predictor_model.input('input_seq').shape[1]
4048
# fixed length of the input embeddings for forward
41-
self.forward_len = self.forward_net.input_info['data'].input_data.shape[1]
49+
self.forward_len = self.forward_model.input('data').shape[1]
4250
if self.verbose:
4351
log.debug('Forward limitations : {0} symbols and {1} embeddings'.format(self.duration_len, self.forward_len))
44-
self.is_attention = 'pos_mask' in self.forward_net.input_info
52+
self.is_attention = check_input_name(self.forward_model, "pos_mask")
4553
if self.is_attention:
4654
self.init_pos_mask()
4755
else:
4856
self.pos_mask = None
4957

50-
self.is_multi_speaker = self.has_speaker_embeddings()
58+
self.is_multi_speaker = check_input_name(self.duration_predictor_model, "speaker_embedding")
5159
if self.is_multi_speaker:
5260
self.init_speaker_information()
5361
else:
@@ -106,13 +114,13 @@ def load_network(self, model_xml):
106114
model_bin_name = ".".join(osp.basename(model_xml).split('.')[:-1]) + ".bin"
107115
model_bin = osp.join(osp.dirname(model_xml), model_bin_name)
108116
log.info('Reading ForwardTacotron model {}'.format(model_xml))
109-
net = self.ie.read_network(model=model_xml, weights=model_bin)
110-
return net
117+
model = self.ie.read_model(model=model_xml, weights=model_bin)
118+
return model
111119

112-
def create_exec_network(self, net, path):
113-
exec_net = self.ie.load_network(network=net, device_name=self.device)
120+
def create_infer_request(self, model, path):
121+
compiled_model = self.ie.compile_model(model, device_name=self.device)
114122
log.info('The ForwardTacotron model {} is loaded to {}'.format(path, self.device))
115-
return exec_net
123+
return compiled_model.create_infer_request()
116124

117125
def infer_duration(self, sequence, speaker_embedding=None, alpha=1.0, non_empty_symbols=None):
118126
if self.is_attention:
@@ -122,15 +130,15 @@ def infer_duration(self, sequence, speaker_embedding=None, alpha=1.0, non_empty_
122130
"input_mask": input_mask,
123131
"pos_mask": pos_mask}
124132
if speaker_embedding is not None:
125-
inputs["speaker_embedding"] = speaker_embedding
126-
out = self.duration_predictor_exec.infer(inputs)
133+
inputs["speaker_embedding"] = np.array([speaker_embedding])
134+
self.duration_predictor_request.infer(inputs)
127135
else:
128-
out = self.duration_predictor_exec.infer(inputs={"input_seq": sequence})
129-
duration = out["duration"] * alpha
136+
self.duration_predictor_request.infer(inputs={"input_seq": sequence})
137+
duration = self.duration_predictor_request.get_tensor("duration").data[:] * alpha
130138

131139
duration = (duration + 0.5).astype('int').flatten()
132140
duration = np.expand_dims(duration, axis=0)
133-
preprocessed_embeddings = out["embeddings"]
141+
preprocessed_embeddings = self.duration_predictor_request.get_tensor("embeddings").data[:]
134142

135143
if non_empty_symbols is not None:
136144
duration = duration[:, :non_empty_symbols]
@@ -150,11 +158,11 @@ def infer_mel(self, aligned_emb, non_empty_symbols, speaker_embedding=None):
150158
"data_mask": data_mask,
151159
"pos_mask": pos_mask}
152160
if speaker_embedding is not None:
153-
inputs["speaker_embedding"] = speaker_embedding
154-
out = self.forward_exec.infer(inputs)
161+
inputs["speaker_embedding"] = np.array([speaker_embedding])
162+
self.forward_request.infer(inputs)
155163
else:
156-
out = self.forward_exec.infer(inputs={"data": aligned_emb})
157-
return out['mel'][:, :non_empty_symbols]
164+
self.forward_request.infer(inputs={"data": aligned_emb})
165+
return self.forward_request.get_tensor('mel').data[:, :non_empty_symbols]
158166

159167
def find_optimal_delimiters_position(self, sequence, delimiters, idx, window=20):
160168
res = {d: -1 for d in delimiters}
@@ -238,16 +246,12 @@ def forward(self, text, alpha=1.0, speaker_id=19, speaker_emb=None):
238246
return res
239247

240248
def get_speaker_embeddings(self):
241-
if self.has_speaker_embeddings():
249+
if self.is_multi_speaker:
242250
return self.speaker_embeddings
243-
else:
244-
return None
245-
246-
def has_speaker_embeddings(self):
247-
return 'speaker_embedding' in self.duration_predictor_net.input_info
251+
return None
248252

249253
def get_pca_speaker_embedding(self, gender, alpha):
250-
if not self.has_speaker_embeddings():
254+
if not self.is_multi_speaker:
251255
return None
252256

253257
emb = self.male_embeddings if gender == "Male" else self.female_embeddings

demos/text_to_speech_demo/python/models/mel2wave_ie.py

Lines changed: 50 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
"""
2-
Copyright (c) 2020 Intel Corporation
2+
Copyright (c) 2020-2022 Intel Corporation
33
44
Licensed under the Apache License, Version 2.0 (the "License");
55
you may not use this file except in compliance with the License.
@@ -19,6 +19,7 @@
1919

2020
import numpy as np
2121

22+
from openvino.runtime import PartialShape, set_batch, Layout
2223
from utils.wav_processing import (
2324
fold_with_overlap, infer_from_discretized_mix_logistic, pad_tensor, xfade_and_unfold,
2425
)
@@ -49,37 +50,41 @@ def __init__(self, model_upsample, model_rnn, ie, target=11000, overlap=550, hop
4950
self.batch_sizes = [1, 2, 4, 8, 16, 32, 64, 128, 256]
5051
self.ie = ie
5152

52-
self.upsample_net = self.load_network(model_upsample)
53+
self.upsample_model = self.load_network(model_upsample)
5354
if upsampler_width > 0:
54-
orig_shape = self.upsample_net.input_info['mels'].input_data.shape
55-
self.upsample_net.reshape({"mels": (orig_shape[0], upsampler_width, orig_shape[2])})
55+
orig_shape = self.upsample_model.input('mels').shape
56+
self.upsample_model.reshape({"mels": PartialShape([orig_shape[0], upsampler_width, orig_shape[2]])})
5657

57-
self.upsample_exec = self.create_exec_network(self.upsample_net, model_upsample)
58+
self.upsample_request = self.create_infer_requests(self.upsample_model, model_upsample)
5859

59-
self.rnn_net = self.load_network(model_rnn)
60-
self.rnn_exec = self.create_exec_network(self.rnn_net, model_rnn, batch_sizes=self.batch_sizes)
60+
self.rnn_model = self.load_network(model_rnn)
61+
self.rnn_requests = self.create_infer_requests(self.rnn_model, model_rnn, batch_sizes=self.batch_sizes)
6162

6263
# fixed number of mels in the mel-spectrogram
63-
self.mel_len = self.upsample_net.input_info['mels'].input_data.shape[1] - 2 * self.pad
64-
self.rnn_width = self.rnn_net.input_info['x'].input_data.shape[1]
64+
self.mel_len = self.upsample_model.input('mels').shape[1] - 2 * self.pad
65+
self.rnn_width = self.rnn_model.input('h1.1').shape[1]
6566

6667
def load_network(self, model_xml):
6768
model_bin_name = ".".join(osp.basename(model_xml).split('.')[:-1]) + ".bin"
6869
model_bin = osp.join(osp.dirname(model_xml), model_bin_name)
6970
log.info('Reading WaveRNN model {}'.format(model_xml))
70-
net = self.ie.read_network(model=model_xml, weights=model_bin)
71-
return net
71+
model = self.ie.read_model(model=model_xml, weights=model_bin)
72+
return model
7273

73-
def create_exec_network(self, net, path, batch_sizes=None):
74+
def create_infer_requests(self, model, path, batch_sizes=None):
7475
if batch_sizes is not None:
75-
exec_net = []
76+
requests = []
77+
for parameter in model.get_parameters():
78+
parameter.set_layout(Layout("BC"))
7679
for b_s in batch_sizes:
77-
net.batch_size = b_s
78-
exec_net.append(self.ie.load_network(network=net, device_name=self.device))
80+
set_batch(model, b_s)
81+
compiled_model = self.ie.compile_model(model, device_name=self.device)
82+
requests.append(compiled_model.create_infer_request())
7983
else:
80-
exec_net = self.ie.load_network(network=net, device_name=self.device)
84+
compiled_model = self.ie.compile_model(model, device_name=self.device)
85+
requests = compiled_model.create_infer_request()
8186
log.info('The WaveRNN model {} is loaded to {}'.format(path, self.device))
82-
return exec_net
87+
return requests
8388

8489
@staticmethod
8590
def get_rnn_init_states(b_size=1, rnn_dims=328):
@@ -133,8 +138,9 @@ def forward(self, mels):
133138
def forward_upsample(self, mels):
134139
mels = pad_tensor(mels, pad=self.pad)
135140

136-
out = self.upsample_exec.infer(inputs={"mels": mels})
137-
upsample_mels, aux = out["upsample_mels"][:, self.indent:-self.indent, :], out["aux"]
141+
self.upsample_request.infer(inputs={"mels": mels})
142+
upsample_mels = self.upsample_request.get_tensor("upsample_mels").data[:, self.indent:-self.indent, :]
143+
aux = self.upsample_request.get_tensor("aux").data[:]
138144
return upsample_mels, aux
139145

140146
def forward_rnn(self, mels, upsampled_mels, aux):
@@ -160,13 +166,12 @@ def forward_rnn(self, mels, upsampled_mels, aux):
160166

161167
a1_t, a2_t, a3_t, a4_t = \
162168
(a[:, i, :] for a in aux_split)
169+
self.rnn_requests[active_network].infer(inputs={"m_t": m_t, "a1_t": a1_t, "a2_t": a2_t, "a3_t": a3_t,
170+
"a4_t": a4_t, "h1.1": h1, "h2.1": h2, "x": x})
163171

164-
out = self.rnn_exec[active_network].infer(inputs={"m_t": m_t, "a1_t": a1_t, "a2_t": a2_t, "a3_t": a3_t,
165-
"a4_t": a4_t, "h1.1": h1, "h2.1": h2, "x": x})
166-
167-
logits = out["logits"]
168-
h1 = out["h1"]
169-
h2 = out["h2"]
172+
logits = self.rnn_requests[active_network].get_tensor('logits').data[:]
173+
h1 = self.rnn_requests[active_network].get_tensor('h1').data[:]
174+
h2 = self.rnn_requests[active_network].get_tensor('h2').data[:]
170175

171176
sample = infer_from_discretized_mix_logistic(logits)
172177

@@ -204,38 +209,40 @@ def __init__(self, model, ie, device='CPU', default_width=800):
204209
self.scales = 4
205210
self.hop_length = 256
206211

207-
self.net = self.load_network(model)
208-
if self.net.input_info['mel'].input_data.shape[2] != default_width:
209-
orig_shape = self.net.input_info['mel'].input_data.shape
212+
self.model = self.load_network(model)
213+
if self.model.input('mel').shape[2] != default_width:
214+
orig_shape = self.model.input('mel').shape
210215
new_shape = (orig_shape[0], orig_shape[1], default_width)
211-
self.net.reshape({"mel": new_shape})
216+
self.model.reshape({"mel": PartialShape([new_shape[0], new_shape[1], new_shape[2]])})
212217

213-
self.exec_net = self.create_exec_network(self.net, self.scales)
218+
self.requests = self.create_infer_requests(self.model, model, self.scales)
214219

215220
# fixed number of columns in the mel-spectrogram
216-
self.mel_len = self.net.input_info['mel'].input_data.shape[2]
221+
self.mel_len = self.model.input('mel').shape[2]
217222
self.widths = [self.mel_len * (i + 1) for i in range(self.scales)]
218223

219224
def load_network(self, model_xml):
220225
model_bin_name = ".".join(osp.basename(model_xml).split('.')[:-1]) + ".bin"
221226
model_bin = osp.join(osp.dirname(model_xml), model_bin_name)
222227
log.info('Reading MelGAN model {}'.format(model_xml))
223-
net = self.ie.read_network(model=model_xml, weights=model_bin)
224-
return net
228+
model = self.ie.read_model(model=model_xml, weights=model_bin)
229+
return model
225230

226-
def create_exec_network(self, net, path, scales=None):
231+
def create_infer_requests(self, model, path, scales=None):
227232
if scales is not None:
228-
orig_shape = net.input_info['mel'].input_data.shape
229-
exec_net = []
233+
orig_shape = model.input('mel').shape
234+
requests = []
230235
for i in range(scales):
231236
new_shape = (orig_shape[0], orig_shape[1], orig_shape[2] * (i + 1))
232-
net.reshape({"mel": new_shape})
233-
exec_net.append(self.ie.load_network(network=net, device_name=self.device))
234-
net.reshape({"mel": orig_shape})
237+
model.reshape({"mel": PartialShape([new_shape[0], new_shape[1], new_shape[2]])})
238+
compiled_model = self.ie.compile_model(model, device_name=self.device)
239+
requests.append(compiled_model.create_infer_request())
240+
model.reshape({"mel": PartialShape([orig_shape[0], orig_shape[1], orig_shape[2]])})
235241
else:
236-
exec_net = self.ie.load_network(network=net, device_name=self.device)
242+
compiled_model = self.ie.compile_model(model, device_name=self.device)
243+
requests = compiled_model.create_infer_request()
237244
log.info('The MelGAN model {} is loaded to {}'.format(path, self.device))
238-
return exec_net
245+
return requests
239246

240247
def forward(self, mel):
241248
mel = np.expand_dims(mel, axis=0)
@@ -261,7 +268,8 @@ def forward(self, mel):
261268
c_begin = 0
262269
c_end = cur_w
263270
while c_begin < cols:
264-
audio = self.exec_net[active_net].infer(inputs={"mel": mel[:, :, c_begin:c_end]})["audio"]
271+
self.requests[active_net].infer(inputs={"mel": mel[:, :, c_begin:c_end]})
272+
audio = self.requests[active_net].get_tensor("audio").data[:]
265273
res_audio.extend(audio)
266274

267275
c_begin = c_end

0 commit comments

Comments
 (0)