
Commit 4ec9536

add new processor for baker

1 parent e77b9f8 commit 4ec9536

File tree

1 file changed: +110 -94 lines changed

tensorflow_tts/processor/baker.py

Lines changed: 110 additions & 94 deletions
@@ -1,4 +1,5 @@
 import os
+import re
 import numpy as np
 import librosa
 import soundfile as sf
@@ -10,28 +11,29 @@

 _pad = ['_']
 _eos = ['~']
-_pause = ['sil', 'sp1']
-_initials = ['b', 'c', 'ch', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'p', 'q', 'r', 's', 'sh', 't', 'x', 'z', 'zh']
+_pause = ['sil', '#0', '#1', '#2', '#3']
+
+_initials = ['^', 'b', 'c', 'ch', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'p', 'q', 'r', 's', 'sh', 't', 'x', 'z', 'zh']
+
 _tones = ['1', '2', '3', '4', '5']
-_finals = ['a', 'ai', 'an', 'ang', 'ao', 'e', 'ei', 'en', 'eng', 'er', 'i', 'ia', 'ian', 'iang', 'iao', 'ie',
-           'ii', 'iii', 'in', 'ing', 'iong', 'iou', 'o', 'ong', 'ou', 'u', 'ua', 'uai', 'uan', 'uang',
-           'uei', 'uen', 'ueng', 'uo', 'v', 'van', 've', 'vn']
-_special = ['io5']

-symbols = _pad + _pause + _initials + [i + j for i in _finals for j in _tones] + _special + _eos
+_finals = ['a', 'ai', 'an', 'ang', 'ao', 'e', 'ei', 'en', 'eng', 'er', 'i', 'ia', 'ian', 'iang', 'iao', 'ie', 'ii',
+           'iii', 'in', 'ing', 'iong', 'iou', 'o', 'ong', 'ou', 'u', 'ua', 'uai', 'uan', 'uang', 'uei', 'uen', 'ueng',
+           'uo', 'v', 'van', 've', 'vn']
+
+symbols = _pad + _pause + _initials + [i + j for i in _finals for j in _tones] + _eos

 # Mappings from symbol to numeric ID and vice versa:
 _symbol_to_id = {s: i for i, s in enumerate(symbols)}
 _id_to_symbol = {i: s for i, s in enumerate(symbols)}


-
 pinyin_dict = {
-    'a': ('', 'a'),
-    'ai': ('', 'ai'),
-    'an': ('', 'an'),
-    'ang': ('', 'ang'),
-    'ao': ('', 'ao'),
+    'a': ('^', 'a'),
+    'ai': ('^', 'ai'),
+    'an': ('^', 'an'),
+    'ang': ('^', 'ang'),
+    'ao': ('^', 'ao'),
     'ba': ('b', 'a'),
     'bai': ('b', 'ai'),
     'ban': ('b', 'an'),
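For orientation: the pause set now carries the prosodic-break markers #0 to #3, and '^' acts as a placeholder initial for zero-initial syllables, so every Mandarin syllable maps onto exactly one initial plus one toned final. A minimal sketch of the resulting symbol table (counts are derived from the lists above; it assumes the package installs and the module imports as tensorflow_tts.processor.baker, which is not shown in this diff):

# Sketch: inspect the symbol inventory defined above (illustrative, not part of the commit).
from tensorflow_tts.processor import baker

# 1 pad + 5 pauses + 22 initials (incl. '^') + 38 finals * 5 tones + 1 eos = 219 symbols
print(len(baker.symbols))

print(baker._symbol_to_id['sil'])   # pause symbols sit right after the '_' pad
print(baker._symbol_to_id['^'])     # zero-initial placeholder is an ordinary symbol
print(baker._symbol_to_id['uan2'])  # toned finals such as 'uan2' are single symbols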
@@ -107,12 +109,12 @@
     'dui': ('d', 'uei'),
     'dun': ('d', 'uen'),
     'duo': ('d', 'uo'),
-    'e': ('', 'e'),
-    'ei': ('', 'ei'),
-    'en': ('', 'en'),
-    'ng': ('', 'en'),
-    'eng': ('', 'eng'),
-    'er': ('', 'er'),
+    'e': ('^', 'e'),
+    'ei': ('^', 'ei'),
+    'en': ('^', 'en'),
+    'ng': ('^', 'en'),
+    'eng': ('^', 'eng'),
+    'er': ('^', 'er'),
     'fa': ('f', 'a'),
     'fan': ('f', 'an'),
     'fang': ('f', 'ang'),
@@ -265,8 +267,8 @@
     'nve': ('n', 've'),
     'nue': ('n', 've'),
     'nuo': ('n', 'uo'),
-    'o': ('', 'o'),
-    'ou': ('', 'ou'),
+    'o': ('^', 'o'),
+    'ou': ('^', 'ou'),
     'pa': ('p', 'a'),
     'pai': ('p', 'ai'),
     'pan': ('p', 'an'),
@@ -369,15 +371,15 @@
     'tui': ('t', 'uei'),
     'tun': ('t', 'uen'),
     'tuo': ('t', 'uo'),
-    'wa': ('', 'ua'),
-    'wai': ('', 'uai'),
-    'wan': ('', 'uan'),
-    'wang': ('', 'uang'),
-    'wei': ('', 'uei'),
-    'wen': ('', 'uen'),
-    'weng': ('', 'ueng'),
-    'wo': ('', 'uo'),
-    'wu': ('', 'u'),
+    'wa': ('^', 'ua'),
+    'wai': ('^', 'uai'),
+    'wan': ('^', 'uan'),
+    'wang': ('^', 'uang'),
+    'wei': ('^', 'uei'),
+    'wen': ('^', 'uen'),
+    'weng': ('^', 'ueng'),
+    'wo': ('^', 'uo'),
+    'wu': ('^', 'u'),
     'xi': ('x', 'i'),
     'xia': ('x', 'ia'),
     'xian': ('x', 'ian'),
@@ -392,21 +394,21 @@
     'xuan': ('x', 'van'),
     'xue': ('x', 've'),
     'xun': ('x', 'vn'),
-    'ya': ('', 'ia'),
-    'yan': ('', 'ian'),
-    'yang': ('', 'iang'),
-    'yao': ('', 'iao'),
-    'ye': ('', 'ie'),
-    'yi': ('', 'i'),
-    'yin': ('', 'in'),
-    'ying': ('', 'ing'),
-    'yo': ('', 'iou'),
-    'yong': ('', 'iong'),
-    'you': ('', 'iou'),
-    'yu': ('', 'v'),
-    'yuan': ('', 'van'),
-    'yue': ('', 've'),
-    'yun': ('', 'vn'),
+    'ya': ('^', 'ia'),
+    'yan': ('^', 'ian'),
+    'yang': ('^', 'iang'),
+    'yao': ('^', 'iao'),
+    'ye': ('^', 'ie'),
+    'yi': ('^', 'i'),
+    'yin': ('^', 'in'),
+    'ying': ('^', 'ing'),
+    'yo': ('^', 'iou'),
+    'yong': ('^', 'iong'),
+    'you': ('^', 'iou'),
+    'yu': ('^', 'v'),
+    'yuan': ('^', 'van'),
+    'yue': ('^', 've'),
+    'yun': ('^', 'vn'),
     'za': ('z', 'a'),
     'zai': ('z', 'ai'),
     'zan': ('z', 'an'),
@@ -447,20 +449,11 @@
 }


-def process_phonelabel(label_file):
-    with open(label_file, 'r', encoding='utf-8') as f:
-        lines = f.readlines()[12:]
-    assert len(lines) % 3 == 0
-
-    text = []
-    for i in range(0, len(lines), 3):
-        begin = float(lines[i].strip())
-        if i == 0:
-            assert begin == 0.
-        phone = lines[i + 2].strip()
-        text.append(phone.replace('"', ''))
-
-    return text
+zh_pattern = re.compile(u'[\u4e00-\u9fa5]')
+def is_zh(word):
+    global zh_pattern
+    match = zh_pattern.search(word)
+    return match is not None


 class MyConverter(NeutralToneWith5Mixin, DefaultConverter):
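The old interval-file reader is gone; is_zh simply tests whether a character falls in the CJK Unified Ideographs range U+4E00 to U+9FA5, which is what drives the branching in the parser further down. A tiny standalone restatement for a quick behavioural check (illustrative only, not part of the commit):

# Standalone restatement of the is_zh helper above, for illustration.
import re

zh_pattern = re.compile(u'[\u4e00-\u9fa5]')

def is_zh(word):
    return zh_pattern.search(word) is not None

print(is_zh('儿'))   # True  - CJK ideograph
print(is_zh('#'))    # False - prosody markers are handled by a separate branch
print(is_zh('。'))   # False - punctuation is skipped by the parser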
@@ -479,41 +472,61 @@ def __init__(self, data_dir, target_rate=24000, cleaner_names=None):
         with open(os.path.join(data_dir, 'ProsodyLabeling/000001-010000.txt'), encoding='utf-8') as ttf:
             lines = ttf.readlines()
             for idx in range(0, len(lines), 2):
-                utt_id, _ = lines[idx].strip().split()
-                phonemes = process_phonelabel(os.path.join(data_dir, f'PhoneLabeling/{utt_id}.interval'))
-                phonemes = self.deal_r(phonemes)
-                if 'pl' in phonemes or 'ng1' in phonemes:
-                    print(f'Skip this: {utt_id} {phonemes}')
+                utt_id, chn_char = lines[idx].strip().split()
+                pinyin = lines[idx+1].strip().split()
+                if 'IY1' in pinyin or 'B' in chn_char:
+                    print(f'Skip this: {utt_id} {chn_char} {pinyin}')
                     continue
+                phonemes = self.get_phoneme_from_char_and_pinyin(chn_char, pinyin)
                 wav_path = os.path.join(data_dir, 'Wave', '%s.wav' % utt_id)
                 items.append([' '.join(phonemes), wav_path, self.speaker_name, utt_id])
             self.items = items
-
-        self.pinyin = self.get_pinyin()
+        self.pinyin_parser = self.get_pinyin_parser()

     @staticmethod
-    def deal_r(phonemes):
-        result = []
-        for p in phonemes:
-            if p[-1].isdigit() and p[-2] == 'r' and p[:2] != 'er':
-                result.append(p[:-2] + p[-1])
-                result.append('er5')
-            else:
-                result.append(p)
-        return result
+    def get_phoneme_from_char_and_pinyin(chn_char, pinyin):
+        # we do not need #4, use sil to replace it
+        chn_char = chn_char.replace('#4', '')
+        char_len = len(chn_char)
+        i, j = 0, 0
+        result = ['sil']
+        while i < char_len:
+            cur_char = chn_char[i]
+            if is_zh(cur_char):
+                if pinyin[j][:-1] not in pinyin_dict:
+                    assert chn_char[i+1] == '儿'
+                    assert pinyin[j][-2] == 'r'
+                    tone = pinyin[j][-1]
+                    a = pinyin[j][:-2]
+                    a1, a2 = pinyin_dict[a]
+                    result += [a1, a2 + tone, 'er5']
+                    if i+2 < char_len and chn_char[i+2] != '#':
+                        result.append('#0')

-    @staticmethod
-    def get_initials_and_finals(text):
-        result = []
-        for x in text.split():
-            assert x[-1].isdigit()
-            tone = x[-1]
-            initial, final = pinyin_dict[x[:-1]]
-            if initial != '':
-                result.append(initial)
-            assert final is not ''
-            result.append(final + tone)
-        result = ' '.join(result)
+                    i += 2
+                    j += 1
+                else:
+                    tone = pinyin[j][-1]
+                    a = pinyin[j][:-1]
+                    a1, a2 = pinyin_dict[a]
+                    result += [a1, a2 + tone]
+
+                    if i + 1 < char_len and chn_char[i + 1] != '#':
+                        result.append('#0')
+
+                    i += 1
+                    j += 1
+            elif cur_char == '#':
+                result.append(chn_char[i:i+2])
+                i += 2
+            else:
+                # ignore the unknown char and punctuation
+                # result.append(chn_char[i])
+                i += 1
+        if result[-1] == '#0':
+            result = result[:-1]
+        result.append('sil')
+        assert j == len(pinyin)
         return result

     def get_one_sample(self, item):
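To make the control flow of the new parser concrete, here is a hedged trace on a made-up Baker-style prosody-labelled line (the class name BakerProcessor is assumed from the surrounding file and is not visible in this hunk; the sentence and pinyin are illustrative, not dataset content):

# Illustrative trace of get_phoneme_from_char_and_pinyin (assumes the module imports).
from tensorflow_tts.processor.baker import BakerProcessor

chn_char = '卡尔普#2陪外孙#1玩滑梯#4。'                        # prosody-labelled text; '#4' is dropped
pinyin = 'ka2 er2 pu3 pei2 wai4 sun1 wan2 hua2 ti1'.split()   # one TONE3 token per Chinese character

phonemes = BakerProcessor.get_phoneme_from_char_and_pinyin(chn_char, pinyin)
print(' '.join(phonemes))
# Expected:
# sil k a2 #0 ^ er2 #0 p u3 #2 p ei2 #0 ^ uai4 #0 s uen1 #1 ^ uan2 #0 h ua2 #0 t i1 sil

# Erhua syllables cover two characters with a single pinyin token and expand to 'er5':
print(BakerProcessor.get_phoneme_from_char_and_pinyin('花儿', ['huar1']))
# Expected: ['sil', 'h', 'ua1', 'er5', 'sil']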
@@ -545,7 +558,7 @@ def get_one_sample(self, item):

         return sample

-    def get_pinyin(self):
+    def get_pinyin_parser(self):
         my_pinyin = Pinyin(MyConverter())
         pinyin = my_pinyin.pinyin
         return pinyin
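The renamed get_pinyin_parser still just builds a pypinyin Pinyin instance around MyConverter, whose NeutralToneWith5Mixin renders neutral tones as an explicit '5'. A rough sketch of what the parser returns in TONE3 style (assumes pypinyin is installed; the sample text and expected output are illustrative):

# Sketch of the pypinyin parser the processor builds (illustrative only).
from pypinyin import Pinyin, Style
from pypinyin.contrib.neutral_tone import NeutralToneWith5Mixin
from pypinyin.converter import DefaultConverter

class MyConverter(NeutralToneWith5Mixin, DefaultConverter):
    pass

my_pinyin = Pinyin(MyConverter())
print(my_pinyin.pinyin('你好吗', style=Style.TONE3))
# Expected shape: one single-element list per character, e.g. [['ni3'], ['hao3'], ['ma5']]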
@@ -554,12 +567,15 @@ def text_to_sequence(self, text, inference=False):
         global _symbol_to_id

         if inference:
-            text = self.pinyin(text, style=Style.TONE3)
-            new_text = []
-            for x in text:
-                new_text.append(''.join(x))
-            text = self.get_initials_and_finals(' '.join(new_text))
-            print(text)
+            pinyin = self.pinyin_parser(text, style=Style.TONE3)
+            new_pinyin = []
+            for x in pinyin:
+                x = ''.join(x)
+                if '#' not in x:
+                    new_pinyin.append(x)
+            phonemes = self.get_phoneme_from_char_and_pinyin(text, new_pinyin)
+            text = ' '.join(phonemes)
+            print(f'phoneme seq: {text}')

         sequence = []
         for symbol in text.split():
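End to end, training items already carry the phoneme string built in __init__, so text_to_sequence only has to split and map; with inference=True it first re-derives TONE3 pinyin from the raw text, drops any '#'-bearing tokens the pinyin parser passes through for prosody marks, and runs the same char-plus-pinyin parser. A heavily hedged usage sketch (class name, data path, and input sentence are assumptions; constructing the processor needs the Baker/BZNSYP corpus because __init__ reads the prosody labels):

# Hypothetical inference call (illustrative; path and class name are assumptions).
from tensorflow_tts.processor.baker import BakerProcessor

processor = BakerProcessor(data_dir='/path/to/BZNSYP')        # hypothetical dataset location
ids = processor.text_to_sequence('这是一个测试', inference=True)
# Punctuation is left out of the sample: pypinyin returns it as its own token while the
# character loop skips it, so punctuation-free input keeps the final length assert happy.
# The method prints the derived phoneme sequence, then maps each symbol through
# _symbol_to_id to produce the integer id list consumed by the models.
print(ids)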
