Skip to content

Commit 9632381

Browse files
committed
fix yamls, change labels to stop_labels, test=tts
1 parent 1bf1a87 commit 9632381

File tree

14 files changed

+53
-73
lines changed

14 files changed

+53
-73
lines changed

examples/aishell3/tts3/conf/default.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,8 @@ fmax: 7600 # Maximum frequency of Mel basis.
1616
n_mels: 80 # The number of mel basis.
1717

1818
# Only used for the model using pitch features (e.g. FastSpeech2)
19-
f0min: 80 # Maximum f0 for pitch extraction.
20-
f0max: 400 # Minimum f0 for pitch extraction.
19+
f0min: 80 # Minimum f0 for pitch extraction.
20+
f0max: 400 # Maximum f0 for pitch extraction.
2121

2222

2323
###########################################################

examples/aishell3/vc1/conf/default.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,8 @@ fmax: 7600 # Maximum frequency of Mel basis.
1616
n_mels: 80 # The number of mel basis.
1717

1818
# Only used for the model using pitch features (e.g. FastSpeech2)
19-
f0min: 80 # Maximum f0 for pitch extraction.
20-
f0max: 400 # Minimum f0 for pitch extraction.
19+
f0min: 80 # Minimum f0 for pitch extraction.
20+
f0max: 400 # Maximum f0 for pitch extraction.
2121

2222

2323
###########################################################

examples/csmsc/tts0/conf/default.yaml

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,6 @@ fmin: 80 # Minimum frequency of Mel basis.
2121
fmax: 7600 # Maximum frequency of Mel basis.
2222
n_mels: 80 # The number of mel basis.
2323

24-
# Only used for the model using pitch features (e.g. FastSpeech2)
25-
f0min: 80 # Maximum f0 for pitch extraction.
26-
f0max: 400 # Minimum f0 for pitch extraction.
27-
2824
###########################################################
2925
# DATA SETTING #
3026
###########################################################

examples/csmsc/tts3/conf/conformer.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,8 @@ fmax: 7600 # Maximum frequency of Mel basis.
1616
n_mels: 80 # The number of mel basis.
1717

1818
# Only used for the model using pitch features (e.g. FastSpeech2)
19-
f0min: 80 # Maximum f0 for pitch extraction.
20-
f0max: 400 # Minimum f0 for pitch extraction.
19+
f0min: 80 # Minimum f0 for pitch extraction.
20+
f0max: 400 # Maximum f0 for pitch extraction.
2121

2222

2323
###########################################################

examples/csmsc/tts3/conf/default.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,8 @@ fmax: 7600 # Maximum frequency of Mel basis.
1616
n_mels: 80 # The number of mel basis.
1717

1818
# Only used for the model using pitch features (e.g. FastSpeech2)
19-
f0min: 80 # Maximum f0 for pitch extraction.
20-
f0max: 400 # Minimum f0 for pitch extraction.
19+
f0min: 80 # Minimum f0 for pitch extraction.
20+
f0max: 400 # Maximum f0 for pitch extraction.
2121

2222

2323
###########################################################

examples/ljspeech/tts3/conf/default.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,8 @@ fmax: 7600 # Maximum frequency of Mel basis.
1616
n_mels: 80 # The number of mel basis.
1717

1818
# Only used for the model using pitch features (e.g. FastSpeech2)
19-
f0min: 80 # Maximum f0 for pitch extraction.
20-
f0max: 400 # Minimum f0 for pitch extraction.
19+
f0min: 80 # Minimum f0 for pitch extraction.
20+
f0max: 400 # Maximum f0 for pitch extraction.
2121

2222

2323
###########################################################

examples/vctk/tts3/conf/default.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,8 @@ fmax: 7600 # Maximum frequency of Mel basis.
1616
n_mels: 80 # The number of mel basis.
1717

1818
# Only used for the model using pitch features (e.g. FastSpeech2)
19-
f0min: 80 # Maximum f0 for pitch extraction.
20-
f0max: 400 # Minimum f0 for pitch extraction.
19+
f0min: 80 # Minimum f0 for pitch extraction.
20+
f0max: 400 # Maximum f0 for pitch extraction.
2121

2222

2323
###########################################################

paddlespeech/t2s/exps/new_tacotron2/preprocess.py

Lines changed: 1 addition & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -27,9 +27,7 @@
2727
import yaml
2828
from yacs.config import CfgNode
2929

30-
from paddlespeech.t2s.data.get_feats import Energy
3130
from paddlespeech.t2s.data.get_feats import LogMelFBank
32-
from paddlespeech.t2s.data.get_feats import Pitch
3331
from paddlespeech.t2s.datasets.preprocess_utils import compare_duration_and_mel_length
3432
from paddlespeech.t2s.datasets.preprocess_utils import get_input_token
3533
from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur
@@ -42,8 +40,6 @@ def process_sentence(config: Dict[str, Any],
4240
sentences: Dict,
4341
output_dir: Path,
4442
mel_extractor=None,
45-
pitch_extractor=None,
46-
energy_extractor=None,
4743
cut_sil: bool=True,
4844
spk_emb_dir: Path=None):
4945
utt_id = fp.stem
@@ -117,17 +113,14 @@ def process_sentences(config,
117113
sentences: Dict,
118114
output_dir: Path,
119115
mel_extractor=None,
120-
pitch_extractor=None,
121-
energy_extractor=None,
122116
nprocs: int=1,
123117
cut_sil: bool=True,
124118
spk_emb_dir: Path=None):
125119
if nprocs == 1:
126120
results = []
127121
for fp in fps:
128122
record = process_sentence(config, fp, sentences, output_dir,
129-
mel_extractor, pitch_extractor,
130-
energy_extractor, cut_sil, spk_emb_dir)
123+
mel_extractor, cut_sil, spk_emb_dir)
131124
if record:
132125
results.append(record)
133126
else:
@@ -137,7 +130,6 @@ def process_sentences(config,
137130
for fp in fps:
138131
future = pool.submit(process_sentence, config, fp,
139132
sentences, output_dir, mel_extractor,
140-
pitch_extractor, energy_extractor,
141133
cut_sil, spk_emb_dir)
142134
future.add_done_callback(lambda p: progress.update())
143135
futures.append(future)
@@ -299,17 +291,6 @@ def str2bool(str):
299291
n_mels=config.n_mels,
300292
fmin=config.fmin,
301293
fmax=config.fmax)
302-
pitch_extractor = Pitch(
303-
sr=config.fs,
304-
hop_length=config.n_shift,
305-
f0min=config.f0min,
306-
f0max=config.f0max)
307-
energy_extractor = Energy(
308-
sr=config.fs,
309-
n_fft=config.n_fft,
310-
hop_length=config.n_shift,
311-
win_length=config.win_length,
312-
window=config.window)
313294

314295
# process for the 3 sections
315296
if train_wav_files:
@@ -319,8 +300,6 @@ def str2bool(str):
319300
sentences,
320301
train_dump_dir,
321302
mel_extractor,
322-
pitch_extractor,
323-
energy_extractor,
324303
nprocs=args.num_cpu,
325304
cut_sil=args.cut_sil,
326305
spk_emb_dir=spk_emb_dir)
@@ -331,8 +310,6 @@ def str2bool(str):
331310
sentences,
332311
dev_dump_dir,
333312
mel_extractor,
334-
pitch_extractor,
335-
energy_extractor,
336313
cut_sil=args.cut_sil,
337314
spk_emb_dir=spk_emb_dir)
338315
if test_wav_files:
@@ -342,8 +319,6 @@ def str2bool(str):
342319
sentences,
343320
test_dump_dir,
344321
mel_extractor,
345-
pitch_extractor,
346-
energy_extractor,
347322
nprocs=args.num_cpu,
348323
cut_sil=args.cut_sil,
349324
spk_emb_dir=spk_emb_dir)

paddlespeech/t2s/models/new_tacotron2/tacotron2.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -300,10 +300,10 @@ def forward(
300300
olens = speech_lengths
301301

302302
# make labels for stop prediction
303-
labels = make_pad_mask(olens - 1)
303+
stop_labels = make_pad_mask(olens - 1)
304304
# bool tensors cannot be sliced (so the mask is cast to float32 first)
305-
labels = paddle.cast(labels, dtype='float32')
306-
labels = F.pad(labels, [0, 0, 0, 1], "constant", 1.0)
305+
stop_labels = paddle.cast(stop_labels, dtype='float32')
306+
stop_labels = F.pad(stop_labels, [0, 0, 0, 1], "constant", 1.0)
307307

308308
# calculate tacotron2 outputs
309309
after_outs, before_outs, logits, att_ws = self._forward(
@@ -322,12 +322,13 @@ def forward(
322322
olens = olens - olens % self.reduction_factor
323323
max_out = max(olens)
324324
ys = ys[:, :max_out]
325-
labels = labels[:, :max_out]
326-
labels = paddle.scatter(labels, 1, (olens - 1).unsqueeze(1), 1.0)
325+
stop_labels = stop_labels[:, :max_out]
326+
stop_labels = paddle.scatter(stop_labels, 1,
327+
(olens - 1).unsqueeze(1), 1.0)
327328
olens_in = olens // self.reduction_factor
328329
else:
329330
olens_in = olens
330-
return after_outs, before_outs, logits, ys, labels, olens, att_ws, olens_in
331+
return after_outs, before_outs, logits, ys, stop_labels, olens, att_ws, olens_in
331332

332333
def _forward(
333334
self,

paddlespeech/t2s/models/new_tacotron2/tacotron2_updater.py

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ def update_core(self, batch):
7474
if spk_emb is not None:
7575
spk_id = None
7676

77-
after_outs, before_outs, logits, ys, labels, olens, att_ws, olens_in = self.model(
77+
after_outs, before_outs, logits, ys, stop_labels, olens, att_ws, olens_in = self.model(
7878
text=batch["text"],
7979
text_lengths=batch["text_lengths"],
8080
speech=batch["speech"],
@@ -83,8 +83,13 @@ def update_core(self, batch):
8383
spk_emb=spk_emb)
8484

8585
# calculate taco2 loss
86-
l1_loss, mse_loss, bce_loss = self.taco2_loss(after_outs, before_outs,
87-
logits, ys, labels, olens)
86+
l1_loss, mse_loss, bce_loss = self.taco2_loss(
87+
after_outs=after_outs,
88+
before_outs=before_outs,
89+
logits=logits,
90+
ys=ys,
91+
stop_labels=stop_labels,
92+
olens=olens)
8893

8994
if self.loss_type == "L1+L2":
9095
loss = l1_loss + mse_loss + bce_loss
@@ -164,7 +169,7 @@ def evaluate_core(self, batch):
164169
if spk_emb is not None:
165170
spk_id = None
166171

167-
after_outs, before_outs, logits, ys, labels, olens, att_ws, olens_in = self.model(
172+
after_outs, before_outs, logits, ys, stop_labels, olens, att_ws, olens_in = self.model(
168173
text=batch["text"],
169174
text_lengths=batch["text_lengths"],
170175
speech=batch["speech"],
@@ -173,8 +178,13 @@ def evaluate_core(self, batch):
173178
spk_emb=spk_emb)
174179

175180
# calculate taco2 loss
176-
l1_loss, mse_loss, bce_loss = self.taco2_loss(after_outs, before_outs,
177-
logits, ys, labels, olens)
181+
l1_loss, mse_loss, bce_loss = self.taco2_loss(
182+
after_outs=after_outs,
183+
before_outs=before_outs,
184+
logits=logits,
185+
ys=ys,
186+
stop_labels=stop_labels,
187+
olens=olens)
178188

179189
if self.loss_type == "L1+L2":
180190
loss = l1_loss + mse_loss + bce_loss

0 commit comments

Comments
 (0)