
Commit e2d9469

gradient accumulation, mel-generation, bugfixes
authored by @Rick-McCoy
* backup 0826
* backup
* revise tier splitting
* try tier1 first
* update gitignore
* update
* add unblizzard.yaml
* now train runs
* parallelize rnn ops
* fix biGRU
* fix shape
* fix pad
* fix tier split
* use central stack only at initial tier
* add adam optimizer
* view contiguous
* fix tierutil, centralized stack
* gradient accumulation, mel-generation, bugfixes
* Update gitignore.
* Extremely minor update.
* don't use gmm
* train each tier separately
* validation
* now all tiers work
* batch size to args
* a bit hard-coded constants
* pad at end, some fixes
* Remove tensorboardX.
* Parallelize mel spec generation.
* use LSTM instead of LSTM
* Removed dependency on tensorboardX.
* normalize spectrograms
* fix norm code
* fix central stack residual connection
* Add upsample.py
* add model/upsample.py
* Added VoxCeleb2 & minor changes.
* Changed reading dataset from scipy to pydub (now supports almost all formats)
* Fixed bug in which .cuda() was done to np.ndarray
* .cuda() cannot be called in dataloader.
* Wrong function name
* Removed wrongly placed .cuda()s in MelGen
* Removed more misplaced .cuda()s in audio.py
* cut_divide_tiers() accounted for nonexistent batch
* Fixed loading in validation.py
* Update .gitignore for dataset
* Add gmm.py
* Fixed minor bug
* Fixed bugs.
* Added SGD optimizer.
* Fixed broadcasting bug
* More bug fixing: .cuda() in get_pi_indices
* Added reshape_as to sample_gmm
* Changed indices to numpy array
* Fix bug in sample_gmm
* Fix typo in validation
* More bug fixing in gmm.py
* Fixed an overlooked bug
* Added mu, std, pi to log_training
* Unconditional w/ GMM, SGD lr=1e-4 (#5)
* Integrate upsample.py into tiers 2~N.
* Fixed bug in dataloader: changed torch.stft to librosa; added gradient accumulation
* Minor bug fix.
* Changed MelGen to only deal with numpy
* Changed amp_to_db to power_to_db.
* Minor bug fix.
* Fixed major bug: m4a raw data is in 16-bit format, not 8-bit
* dtype error in numpy
* Update README.md
1 parent bcb2922 · commit e2d9469

File tree

10 files changed: +42 / -58 lines

README.md
Lines changed: 1 addition & 1 deletion

@@ -32,4 +32,4 @@ Implementation of [MelNet: A Generative Model for Audio in the Frequency Domain]
 
 ## License
 
-MIT License
+MIT License

config/voxceleb2.yaml
Lines changed: 1 addition & 0 deletions

@@ -23,6 +23,7 @@ train:
   optimizer: 'SGD'
   sgd:
     lr: 0.0001
+  update_interval: 128
 ---
 log:
   summary_interval: 1
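
The new update_interval key drives gradient accumulation in utils/train.py: gradients from 128 consecutive mini-batches are summed before each SGD step, so the effective batch size is the dataloader batch size times 128. A minimal sketch of reading the value, assuming only that the file keeps its multi-document layout with a top-level train: section (the project's own config loader may differ):

import yaml

# The config uses '---' document separators, so scan every document for 'train'.
with open('config/voxceleb2.yaml') as f:
    train_cfg = next(d['train'] for d in yaml.safe_load_all(f) if d and 'train' in d)

print(train_cfg['sgd']['lr'])          # 0.0001
print(train_cfg['update_interval'])    # 128 mini-batches per optimizer.step()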

datasets/wavloader.py
Lines changed: 4 additions & 4 deletions

@@ -39,12 +39,12 @@ def __init__(self, hp, args, train):
         random.seed(123)
         random.shuffle(self.file_list)
         if train:
-            self.file_list = self.file_list[:int(0.95*len(self.file_list))]
+            self.file_list = self.file_list[:int(0.95 * len(self.file_list))]
         else:
-            self.file_list = self.file_list[int(0.95*len(self.file_list)):]
+            self.file_list = self.file_list[int(0.95 * len(self.file_list)):]
 
         self.wavlen = int(hp.audio.sr * hp.audio.duration)
-        self.tier = 0
+        self.tier = self.args.tier
 
         self.melgen = MelGen(hp)
         self.tierutil = TierUtil(hp)
@@ -54,7 +54,7 @@ def __len__(self):
 
     def __getitem__(self, idx):
         wav = read_wav_np(self.file_list[idx])
-        wav = torch.from_numpy(cut_wav(self.wavlen, wav))
+        wav = cut_wav(self.wavlen, wav)
         mel = self.melgen.get_normalized_mel(wav)
         source, target = self.tierutil.cut_divide_tiers(mel, self.tier)
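
Removing torch.from_numpy here means __getitem__ now returns plain numpy arrays and leaves tensor conversion to the DataLoader's default collate function, which is also why the misplaced .cuda() calls mentioned in the commit list were dropped from the dataset path. A toy illustration of that division of labour (the class below is a stand-in, not the project's dataset class):

import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

class ToyNumpyDataset(Dataset):
    """Returns numpy arrays; the default collate stacks them into torch tensors."""
    def __init__(self, n_items=8, length=100):
        self.data = [np.random.randn(length).astype(np.float32) for _ in range(n_items)]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]           # numpy only: no torch, no .cuda() in workers

loader = DataLoader(ToyNumpyDataset(), batch_size=4)
batch = next(iter(loader))
print(type(batch), batch.shape)         # <class 'torch.Tensor'> torch.Size([4, 100])
# .cuda() belongs in the training loop, after batching.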

model/tier.py
Lines changed: 0 additions & 1 deletion

@@ -32,7 +32,6 @@ def __init__(self, hp, freq, layers, tierN):
         self.pi_softmax = nn.Softmax(dim=3)
 
         # map output to produce GMM parameter eq. (10)
-        # temporarily don't use GMM. Instead, directly estimate value
         self.W_theta = nn.Linear(num_hidden, 30)
 
     def forward(self, x):
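
For context, W_theta maps each hidden vector to 30 values, which correspond to the parameters of a 10-component Gaussian mixture in MelNet's eq. (10) (30 = 3 x 10 for mu, std, pi). A sketch of how such an output is typically split, with placeholder shapes; the exact reshaping in model/tier.py may differ:

import torch
import torch.nn as nn

K = 10                                   # mixture components, since 3 * K = 30
num_hidden = 512                         # placeholder hidden size
W_theta = nn.Linear(num_hidden, 3 * K)

h = torch.randn(2, 64, 80, num_hidden)   # [batch, time, freq, hidden], illustrative
theta = W_theta(h)                       # [..., 30]
mu, std, pi = torch.chunk(theta, 3, dim=-1)

std = torch.exp(std)                     # keep scales positive
pi = torch.softmax(pi, dim=-1)           # mixture weights sum to 1 (cf. pi_softmax above)
print(mu.shape, std.shape, pi.shape)     # each trailing dimension is 10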

utils/audio.py
Lines changed: 10 additions & 31 deletions

@@ -1,52 +1,31 @@
-# based on https://github.com/keithito/tacotron/blob/master/util/audio.py
-
-import torch
 import librosa
 import numpy as np
 
 
 class MelGen():
     def __init__(self, hp):
         self.hp = hp
-        self.window = torch.hann_window(window_length=hp.audio.win_length)
-        self.mel_basis = librosa.filters.mel(
-            sr=hp.audio.sr, n_fft=hp.audio.n_fft, n_mels=hp.audio.n_mels)
-        self.mel_basis = \
-            torch.from_numpy(self.mel_basis).float() # [n_mels, n_fft//2+1]
 
-    def get_magnitude(self, x):
-        x = torch.stft(x,
+    def get_normalized_mel(self, x):
+        x = librosa.feature.melspectrogram(
+            y=x,
+            sr=self.hp.audio.sr,
             n_fft=self.hp.audio.n_fft,
             hop_length=self.hp.audio.hop_length,
             win_length=self.hp.audio.win_length,
-            window=self.window)
-        mag = torch.norm(x, p=2, dim=-1)
-        return mag # [B, n_fft//2+1, T]
-
-    def get_mel(self, x):
-        mag = self.get_magnitude(x)
-        mel = torch.matmul(self.mel_basis, mag)
-        return mel # [B, n_mels, T]
-
-    def get_normalized_mel(self, x):
-        x = self.get_mel(x)
+            n_mels=self.hp.audio.n_mels
+        )
         x = self.pre_spec(x)
         return x
 
     def pre_spec(self, x):
-        return self.normalize(self.amp_to_db(x) - self.hp.audio.ref_level_db)
+        return self.normalize(librosa.power_to_db(x) - self.hp.audio.ref_level_db)
 
     def post_spec(self, x):
-        return self.db_to_amp(self.denormalize(x) + self.hp.audio.ref_level_db)
-
-    def amp_to_db(self, x):
-        return 20.0 * torch.log10(torch.max(x, torch.tensor(1e-6)))
+        return librosa.db_to_power(self.denormalize(x) + self.hp.audio.ref_level_db)
 
     def normalize(self, x):
-        return torch.clamp(x / -self.hp.audio.min_level_db, -1.0, 0.0) + 1.0
-
-    def db_to_amp(self, x):
-        return torch.pow(10.0, 0.05*x)
+        return np.clip(x / -self.hp.audio.min_level_db, -1.0, 0.0) + 1.0
 
     def denormalize(self, x):
-        return (torch.clamp(x, 0.0, 1.0) - 1.0) * -self.hp.audio.min_level_db
+        return (np.clip(x, 0.0, 1.0) - 1.0) * -self.hp.audio.min_level_db
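
The rewritten MelGen stays entirely in numpy/librosa: librosa.feature.melspectrogram produces a power mel spectrogram, librosa.power_to_db converts it to decibels, and normalize/denormalize map the dB range [min_level_db, 0] onto [0, 1]. A small round-trip sketch of the same chain, with assumed hyperparameter values (not the ones from the config files):

import librosa
import numpy as np

# Assumed values for illustration only.
sr, n_fft, hop_length, win_length, n_mels = 16000, 1024, 256, 1024, 80
ref_level_db, min_level_db = 20.0, -100.0

y = np.random.uniform(-1.0, 1.0, sr).astype(np.float32)       # one second of noise

mel = librosa.feature.melspectrogram(
    y=y, sr=sr, n_fft=n_fft, hop_length=hop_length,
    win_length=win_length, n_mels=n_mels)                      # power mel, [n_mels, T]

# pre_spec: power -> dB, shift by ref_level_db, squash into [0, 1].
norm = np.clip((librosa.power_to_db(mel) - ref_level_db) / -min_level_db, -1.0, 0.0) + 1.0

# post_spec: undo the normalization and map back to the power domain.
power = librosa.db_to_power((np.clip(norm, 0.0, 1.0) - 1.0) * -min_level_db + ref_level_db)
print(norm.min(), norm.max())                                  # stays within [0, 1]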

utils/gmm.py
Lines changed: 2 additions & 2 deletions

@@ -2,9 +2,9 @@
 import numpy as np
 
 def get_pi_indices(pi):
-    cumsum = torch.cumsum(pi.cpu(), dim=3)
+    cumsum = torch.cumsum(pi.cpu(), dim=-1)
     rand = torch.rand(pi.shape[:-1] + (1,))
-    indices = (cumsum < rand).sum(dim=3)
+    indices = (cumsum < rand).sum(dim=-1)
     return indices.flatten().detach().numpy()
 
 def sample_gmm(mu, std, pi):
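
get_pi_indices is inverse-CDF sampling over the mixture weights: torch.cumsum turns pi into a CDF along the last dimension, one uniform sample is drawn per distribution, and counting how many CDF entries fall below it gives the sampled component index. Using dim=-1 instead of the hard-coded dim=3 makes the trick rank-agnostic. A standalone sketch:

import torch

pi = torch.tensor([[0.7, 0.2, 0.1],
                   [0.1, 0.1, 0.8]])       # two categorical distributions

cdf = torch.cumsum(pi, dim=-1)             # [[0.7, 0.9, 1.0], [0.1, 0.2, 1.0]]
rand = torch.rand(pi.shape[:-1] + (1,))    # one uniform sample per distribution
indices = (cdf < rand).sum(dim=-1)         # index k is drawn with probability pi[..., k]

print(indices)                             # e.g. tensor([0, 2])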

utils/tierutil.py
Lines changed: 3 additions & 3 deletions

@@ -19,8 +19,8 @@ def __init__(self, hp):
         # 10*16000 // 180 + 1 = 889 (tedlium3)
 
     def cut_divide_tiers(self, x, tierNo):
-        x = x[:, :-(x.size(-1) % self.t_div)]
-        M, T = x.size()
+        x = x[:, :-(x.shape[-1] % self.t_div)]
+        M, T = x.shape
         assert M % self.f_div == 0, \
             'freq(mel) dimension should be divisible by %d, got %d.' \
             % (self.f_div, M)
@@ -40,6 +40,6 @@ def cut_divide_tiers(self, x, tierNo):
 
         # return source, target
         if tierNo == 1:
-            return tiers[-1], tiers[-1]
+            return tiers[-1], tiers[-1].copy()
         else:
            return tiers[-2], tiers[-1]
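
The .copy() for tier 1 matters because source and target would otherwise be the same numpy array, and torch.from_numpy shares memory with its numpy input, so an in-place edit to one tensor would silently change the other. A tiny standalone illustration of the hazard (not project code):

import numpy as np
import torch

# Old behaviour: source and target alias the same buffer.
tier = np.zeros((4, 4), dtype=np.float32)
source, target = tier, tier
torch.from_numpy(source).add_(1.0)       # in-place edit through shared memory
print(target[0, 0])                      # 1.0: the "target" changed as well

# New behaviour: .copy() gives target its own buffer.
tier = np.zeros((4, 4), dtype=np.float32)
source, target = tier, tier.copy()
torch.from_numpy(source).add_(1.0)
print(target[0, 0])                      # 0.0: target is unaffected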

utils/train.py
Lines changed: 18 additions & 13 deletions

@@ -19,10 +19,12 @@
 
 
 def train(args, pt_dir, chkpt_path, trainloader, testloader, writer, logger, hp, hp_str):
-    model = Tier(hp=hp,
-                 freq=hp.audio.n_mels // f_div[hp.model.tier+1] * f_div[args.tier],
-                 layers=hp.model.layers[args.tier-1],
-                 tierN=args.tier).cuda()
+    model = Tier(
+        hp=hp,
+        freq=hp.audio.n_mels // f_div[hp.model.tier+1] * f_div[args.tier],
+        layers=hp.model.layers[args.tier-1],
+        tierN=args.tier
+    ).cuda()
     melgen = MelGen(hp)
     tierutil = TierUtil(hp)
     criterion = GMMLoss()
@@ -74,27 +76,30 @@ def train(args, pt_dir, chkpt_path, trainloader, testloader, writer, logger, hp,
     torch.backends.cudnn.benchmark = True
     try:
         model.train()
+        optimizer.zero_grad()
+        loss_sum = 0
         for epoch in itertools.count(init_epoch+1):
-            trainloader.tier = args.tier
             loader = tqdm(trainloader, desc='Train data loader')
             for source, target in loader:
                 mu, std, pi = model(source.cuda())
                 loss = criterion(target.cuda(), mu, std, pi)
-
-                optimizer.zero_grad()
-                loss.backward()
-                optimizer.step()
                 step += 1
+                (loss / hp.train.update_interval).backward()
+                loss_sum += loss.item() / hp.train.update_interval
+
+                if step % hp.train.update_interval == 0:
+                    optimizer.step()
+                    optimizer.zero_grad()
+                    if step % hp.log.summary_interval == 0:
+                        writer.log_training(loss_sum, mu, std, pi, step)
+                        loader.set_description("Loss %.04f at step %d" % (loss_sum, step))
+                        loss_sum = 0
 
                 loss = loss.item()
                 if loss > 1e8 or math.isnan(loss):
                     logger.error("Loss exploded to %.04f at step %d!" % (loss, step))
                     raise Exception("Loss exploded")
 
-                if step % hp.log.summary_interval == 0:
-                    writer.log_training(loss, mu, std, pi, step)
-                    loader.set_description("Loss %.04f at step %d" % (loss, step))
-
             save_path = os.path.join(pt_dir, '%s_%s_tier%d_%03d.pt'
                                      % (args.name, githash, args.tier, epoch))
             torch.save({
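
The rewritten loop is standard gradient accumulation: every mini-batch loss is scaled by 1 / update_interval before backward(), so after update_interval backward passes the accumulated gradient equals the gradient of the averaged loss, and only then are optimizer.step() and zero_grad() called. A self-contained sketch of the same pattern on a dummy model (the model, data and interval value are placeholders, not the project's):

import torch
import torch.nn as nn

model = nn.Linear(10, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)
criterion = nn.MSELoss()
update_interval = 4                      # accumulate 4 mini-batches per step

optimizer.zero_grad()
loss_sum = 0.0
for step in range(1, 17):                # 16 mini-batches -> 4 optimizer steps
    x, y = torch.randn(8, 10), torch.randn(8, 1)
    loss = criterion(model(x), y)
    (loss / update_interval).backward()  # scale so gradients average rather than sum
    loss_sum += loss.item() / update_interval

    if step % update_interval == 0:
        optimizer.step()                 # one update per accumulated "big batch"
        optimizer.zero_grad()
        print("step %d, accumulated loss %.4f" % (step, loss_sum))
        loss_sum = 0.0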

utils/utils.py
Lines changed: 2 additions & 2 deletions

@@ -13,7 +13,7 @@ def read_wav_np(wavpath):
     file_format = wavpath.split('.')[-1]
     audio = AudioSegment.from_file(wavpath, file_format)
     data = audio.raw_data
-    wav = np.frombuffer(data, dtype=np.uint8)
+    wav = np.frombuffer(data, dtype=np.int16)
 
     if len(wav.shape) == 2:
         wav = wav[:, 0]
@@ -37,6 +37,6 @@ def cut_wav(L, wav):
                      'constant', constant_values=0.0)
     else:
         start = random.randint(0, samples - L)
-        wav = wav[start:start+L]
+        wav = wav[start:start + L]
 
     return wav
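
The dtype fix reflects that pydub's AudioSegment.raw_data is 16-bit signed PCM for m4a (and most other) inputs, so np.frombuffer must read it as np.int16; decoding it as uint8 produced garbage amplitudes. A small sketch of turning an AudioSegment into a normalized float array, checking the sample width instead of assuming it (the project's read_wav_np may do less):

import numpy as np
from pydub import AudioSegment

def audiosegment_to_float(path):
    """Decode any pydub-supported file to mono float32 samples in [-1, 1]."""
    audio = AudioSegment.from_file(path, path.split('.')[-1])
    assert audio.sample_width == 2, "expected 16-bit PCM (2 bytes per sample)"
    wav = np.frombuffer(audio.raw_data, dtype=np.int16)
    if audio.channels > 1:
        wav = wav.reshape(-1, audio.channels)[:, 0]   # keep the first channel
    return wav.astype(np.float32) / 32768.0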

utils/validation.py
Lines changed: 1 addition & 1 deletion

@@ -22,7 +22,7 @@ def validate(args, model, melgen, tierutil, testloader, criterion, writer, step)
     test_loss /= len(testloader.dataset)
     source = source[0].cpu().detach().numpy()
     target = target[0].cpu().detach().numpy()
-    result = sample_gmm(mu, std, pi)[0].cpu().detach().numpy()
+    result = sample_gmm(mu[0], std[0], pi[0]).cpu().detach().numpy()
     writer.log_validation(test_loss, source, target, result, step)
 
     model.train()
