
Commit e2d9469

gradient accumulation, mel-generation, bugfixes
authored by @Rick-McCoy
* backup 0826
* backup
* revise tier splitting
* try tier1 first
* update gitignore
* update
* add unblizzard.yaml
* now train runs
* parallelize rnn ops
* fix biGRU
* fix shape
* fix pad
* fix tier split
* use central stack only at initial tier
* add adam optimizer
* view contiguous
* fix tierutil, centralized stack
* gradient accumulation, mel-generation, bugfixes
* Update gitignore.
* Extremely minor update.
* don't use gmm
* train each tier separately
* validation
* now all tiers work
* batch size to args
* a bit hard-coded constants
* pad at end, some fixes
* Remove tensorboardX.
* Parallelize mel spec generation.
* use LSTM instead of LSTM
* Removed dependency on tensorboardX.
* normalize spectrograms
* fix norm code
* fix central stack residual connection
* Add upsample.py
* add model/upsample.py
* Added VoxCeleb2 & minor changes.
* Changed reading dataset from scipy to pydub (now supports almost all formats)
* Fixed bug in which .cuda() was done to np.ndarray
* .cuda() cannot be called in dataloader.
* Wrong function name
* Removed wrongly placed .cuda()s in MelGen
* Removed more misplaced .cuda()s in audio.py
* cut_divide_tiers() accounted for nonexistent batch
* Fixed loading in validation.py
* Update .gitignore for dataset
* Add gmm.py
* Fixed minor bug
* Fixed bugs.
* Added SGD optimizer.
* Fixed broadcasting bug
* More bug fixing: .cuda() in get_pi_indices
* Added reshape_as to sample_gmm
* Changed indices to numpy array
* Fix bug in sample_gmm
* Fix typo in validation
* More bug fixing in gmm.py
* Fixed an overlooked bug
* Added mu, std, pi to log_training
* Unconditional w/ GMM, SGD lr=1e-4 (#5)
* Integrate upsample.py into tiers 2~N.
* Fixed bug in dataloader: changed torch.stft to librosa; added gradient accumulation
* Minor bug fix.
* Changed MelGen to only deal with numpy
* Changed amp_to_db to power_to_db.
* Minor bug fix.
* Fixed major bug: m4a raw data is in 16-bit format, not 8-bit
* dtype error in numpy
* Update README.md
1 parent bcb2922 · commit e2d9469

File tree

10 files changed: +42 / -58 lines

README.md
Lines changed: 1 addition & 1 deletion

@@ -32,4 +32,4 @@ Implementation of [MelNet: A Generative Model for Audio in the Frequency Domain]
 
 ## License
 
-MIT License
+MIT License

config/voxceleb2.yaml
Lines changed: 1 addition & 0 deletions

@@ -23,6 +23,7 @@ train:
   optimizer: 'SGD'
   sgd:
     lr: 0.0001
+  update_interval: 128
 ---
 log:
   summary_interval: 1
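
The new update_interval key drives gradient accumulation in utils/train.py: gradients from 128 consecutive mini-batches are summed before each SGD step, so the effective batch size is the dataloader batch size times 128. A minimal sketch of reading the value, assuming only that the file keeps its multi-document layout with a top-level train: section (the project's own config loader may differ):

import yaml

# The config uses '---' document separators, so scan every document for 'train'.
with open('config/voxceleb2.yaml') as f:
    train_cfg = next(d['train'] for d in yaml.safe_load_all(f) if d and 'train' in d)

print(train_cfg['sgd']['lr'])          # 0.0001
print(train_cfg['update_interval'])    # 128 mini-batches per optimizer.step()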

datasets/wavloader.py
Lines changed: 4 additions & 4 deletions

@@ -39,12 +39,12 @@ def __init__(self, hp, args, train):
         random.seed(123)
         random.shuffle(self.file_list)
         if train:
-            self.file_list = self.file_list[:int(0.95*len(self.file_list))]
+            self.file_list = self.file_list[:int(0.95 * len(self.file_list))]
         else:
-            self.file_list = self.file_list[int(0.95*len(self.file_list)):]
+            self.file_list = self.file_list[int(0.95 * len(self.file_list)):]
 
         self.wavlen = int(hp.audio.sr * hp.audio.duration)
-        self.tier = 0
+        self.tier = self.args.tier
 
         self.melgen = MelGen(hp)
         self.tierutil = TierUtil(hp)
@@ -54,7 +54,7 @@ def __len__(self):
 
     def __getitem__(self, idx):
         wav = read_wav_np(self.file_list[idx])
-        wav = torch.from_numpy(cut_wav(self.wavlen, wav))
+        wav = cut_wav(self.wavlen, wav)
         mel = self.melgen.get_normalized_mel(wav)
         source, target = self.tierutil.cut_divide_tiers(mel, self.tier)
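
Removing torch.from_numpy here means __getitem__ now returns plain numpy arrays and leaves tensor conversion to the DataLoader's default collate function, which is also why the misplaced .cuda() calls mentioned in the commit list were dropped from the dataset path. A toy illustration of that division of labour (the class below is a stand-in, not the project's dataset class):

import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

class ToyNumpyDataset(Dataset):
    """Returns numpy arrays; the default collate stacks them into torch tensors."""
    def __init__(self, n_items=8, length=100):
        self.data = [np.random.randn(length).astype(np.float32) for _ in range(n_items)]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]           # numpy only: no torch, no .cuda() in workers

loader = DataLoader(ToyNumpyDataset(), batch_size=4)
batch = next(iter(loader))
print(type(batch), batch.shape)         # <class 'torch.Tensor'> torch.Size([4, 100])
# .cuda() belongs in the training loop, after batching.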

model/tier.py
Lines changed: 0 additions & 1 deletion

@@ -32,7 +32,6 @@ def __init__(self, hp, freq, layers, tierN):
         self.pi_softmax = nn.Softmax(dim=3)
 
         # map output to produce GMM parameter eq. (10)
-        # temporarily don't use GMM. Instead, directly estimate value
         self.W_theta = nn.Linear(num_hidden, 30)
 
     def forward(self, x):
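
For context, W_theta maps each hidden vector to 30 values, which correspond to the parameters of a 10-component Gaussian mixture in MelNet's eq. (10) (30 = 3 x 10 for mu, std, pi). A sketch of how such an output is typically split, with placeholder shapes; the exact reshaping in model/tier.py may differ:

import torch
import torch.nn as nn

K = 10                                   # mixture components, since 3 * K = 30
num_hidden = 512                         # placeholder hidden size
W_theta = nn.Linear(num_hidden, 3 * K)

h = torch.randn(2, 64, 80, num_hidden)   # [batch, time, freq, hidden], illustrative
theta = W_theta(h)                       # [..., 30]
mu, std, pi = torch.chunk(theta, 3, dim=-1)

std = torch.exp(std)                     # keep scales positive
pi = torch.softmax(pi, dim=-1)           # mixture weights sum to 1 (cf. pi_softmax above)
print(mu.shape, std.shape, pi.shape)     # each trailing dimension is 10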

utils/audio.py
Lines changed: 10 additions & 31 deletions

@@ -1,52 +1,31 @@
-# based on https://github.com/keithito/tacotron/blob/master/util/audio.py
-
-import torch
 import librosa
 import numpy as np
 
 
 class MelGen():
     def __init__(self, hp):
         self.hp = hp
-        self.window = torch.hann_window(window_length=hp.audio.win_length)
-        self.mel_basis = librosa.filters.mel(
-            sr=hp.audio.sr, n_fft=hp.audio.n_fft, n_mels=hp.audio.n_mels)
-        self.mel_basis = \
-            torch.from_numpy(self.mel_basis).float() # [n_mels, n_fft//2+1]
 
-    def get_magnitude(self, x):
-        x = torch.stft(x,
+    def get_normalized_mel(self, x):
+        x = librosa.feature.melspectrogram(
+            y=x,
+            sr=self.hp.audio.sr,
             n_fft=self.hp.audio.n_fft,
             hop_length=self.hp.audio.hop_length,
             win_length=self.hp.audio.win_length,
-            window=self.window)
-        mag = torch.norm(x, p=2, dim=-1)
-        return mag # [B, n_fft//2+1, T]
-
-    def get_mel(self, x):
-        mag = self.get_magnitude(x)
-        mel = torch.matmul(self.mel_basis, mag)
-        return mel # [B, n_mels, T]
-
-    def get_normalized_mel(self, x):
-        x = self.get_mel(x)
+            n_mels=self.hp.audio.n_mels
+        )
         x = self.pre_spec(x)
         return x
 
     def pre_spec(self, x):
-        return self.normalize(self.amp_to_db(x) - self.hp.audio.ref_level_db)
+        return self.normalize(librosa.power_to_db(x) - self.hp.audio.ref_level_db)
 
     def post_spec(self, x):
-        return self.db_to_amp(self.denormalize(x) + self.hp.audio.ref_level_db)
-
-    def amp_to_db(self, x):
-        return 20.0 * torch.log10(torch.max(x, torch.tensor(1e-6)))
+        return librosa.db_to_power(self.denormalize(x) + self.hp.audio.ref_level_db)
 
     def normalize(self, x):
-        return torch.clamp(x / -self.hp.audio.min_level_db, -1.0, 0.0) + 1.0
-
-    def db_to_amp(self, x):
-        return torch.pow(10.0, 0.05*x)
+        return np.clip(x / -self.hp.audio.min_level_db, -1.0, 0.0) + 1.0
 
     def denormalize(self, x):
-        return (torch.clamp(x, 0.0, 1.0) - 1.0) * -self.hp.audio.min_level_db
+        return (np.clip(x, 0.0, 1.0) - 1.0) * -self.hp.audio.min_level_db
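
The rewritten MelGen stays entirely in numpy/librosa: librosa.feature.melspectrogram produces a power mel spectrogram, librosa.power_to_db converts it to decibels, and normalize/denormalize map the dB range [min_level_db, 0] onto [0, 1]. A small round-trip sketch of the same chain, with assumed hyperparameter values (not the ones from the config files):

import librosa
import numpy as np

# Assumed values for illustration only.
sr, n_fft, hop_length, win_length, n_mels = 16000, 1024, 256, 1024, 80
ref_level_db, min_level_db = 20.0, -100.0

y = np.random.uniform(-1.0, 1.0, sr).astype(np.float32)       # one second of noise

mel = librosa.feature.melspectrogram(
    y=y, sr=sr, n_fft=n_fft, hop_length=hop_length,
    win_length=win_length, n_mels=n_mels)                      # power mel, [n_mels, T]

# pre_spec: power -> dB, shift by ref_level_db, squash into [0, 1].
norm = np.clip((librosa.power_to_db(mel) - ref_level_db) / -min_level_db, -1.0, 0.0) + 1.0

# post_spec: undo the normalization and map back to the power domain.
power = librosa.db_to_power((np.clip(norm, 0.0, 1.0) - 1.0) * -min_level_db + ref_level_db)
print(norm.min(), norm.max())                                  # stays within [0, 1]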

utils/gmm.py
Lines changed: 2 additions & 2 deletions

@@ -2,9 +2,9 @@
 import numpy as np
 
 def get_pi_indices(pi):
-    cumsum = torch.cumsum(pi.cpu(), dim=3)
+    cumsum = torch.cumsum(pi.cpu(), dim=-1)
     rand = torch.rand(pi.shape[:-1] + (1,))
-    indices = (cumsum < rand).sum(dim=3)
+    indices = (cumsum < rand).sum(dim=-1)
     return indices.flatten().detach().numpy()
 
 def sample_gmm(mu, std, pi):
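
get_pi_indices is inverse-CDF sampling over the mixture weights: torch.cumsum turns pi into a CDF along the last dimension, one uniform sample is drawn per distribution, and counting how many CDF entries fall below it gives the sampled component index. Using dim=-1 instead of the hard-coded dim=3 makes the trick rank-agnostic. A standalone sketch:

import torch

pi = torch.tensor([[0.7, 0.2, 0.1],
                   [0.1, 0.1, 0.8]])       # two categorical distributions

cdf = torch.cumsum(pi, dim=-1)             # [[0.7, 0.9, 1.0], [0.1, 0.2, 1.0]]
rand = torch.rand(pi.shape[:-1] + (1,))    # one uniform sample per distribution
indices = (cdf < rand).sum(dim=-1)         # index k is drawn with probability pi[..., k]

print(indices)                             # e.g. tensor([0, 2])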

utils/tierutil.py
Lines changed: 3 additions & 3 deletions

@@ -19,8 +19,8 @@ def __init__(self, hp):
         # 10*16000 // 180 + 1 = 889 (tedlium3)
 
     def cut_divide_tiers(self, x, tierNo):
-        x = x[:, :-(x.size(-1) % self.t_div)]
-        M, T = x.size()
+        x = x[:, :-(x.shape[-1] % self.t_div)]
+        M, T = x.shape
         assert M % self.f_div == 0, \
             'freq(mel) dimension should be divisible by %d, got %d.' \
             % (self.f_div, M)
@@ -40,6 +40,6 @@ def cut_divide_tiers(self, x, tierNo):
 
         # return source, target
         if tierNo == 1:
-            return tiers[-1], tiers[-1]
+            return tiers[-1], tiers[-1].copy()
         else:
            return tiers[-2], tiers[-1]
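
The .copy() for tier 1 matters because source and target would otherwise be the same numpy array, and torch.from_numpy shares memory with its numpy input, so an in-place edit to one tensor would silently change the other. A tiny standalone illustration of the hazard (not project code):

import numpy as np
import torch

# Old behaviour: source and target alias the same buffer.
tier = np.zeros((4, 4), dtype=np.float32)
source, target = tier, tier
torch.from_numpy(source).add_(1.0)       # in-place edit through shared memory
print(target[0, 0])                      # 1.0: the "target" changed as well

# New behaviour: .copy() gives target its own buffer.
tier = np.zeros((4, 4), dtype=np.float32)
source, target = tier, tier.copy()
torch.from_numpy(source).add_(1.0)
print(target[0, 0])                      # 0.0: target is unaffected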

utils/train.py
Lines changed: 18 additions & 13 deletions

@@ -19,10 +19,12 @@
 
 
 def train(args, pt_dir, chkpt_path, trainloader, testloader, writer, logger, hp, hp_str):
-    model = Tier(hp=hp,
-                 freq=hp.audio.n_mels // f_div[hp.model.tier+1] * f_div[args.tier],
-                 layers=hp.model.layers[args.tier-1],
-                 tierN=args.tier).cuda()
+    model = Tier(
+        hp=hp,
+        freq=hp.audio.n_mels // f_div[hp.model.tier+1] * f_div[args.tier],
+        layers=hp.model.layers[args.tier-1],
+        tierN=args.tier
+    ).cuda()
     melgen = MelGen(hp)
     tierutil = TierUtil(hp)
     criterion = GMMLoss()
@@ -74,27 +76,30 @@ def train(args, pt_dir, chkpt_path, trainloader, testloader, writer, logger, hp,
     torch.backends.cudnn.benchmark = True
     try:
         model.train()
+        optimizer.zero_grad()
+        loss_sum = 0
         for epoch in itertools.count(init_epoch+1):
-            trainloader.tier = args.tier
             loader = tqdm(trainloader, desc='Train data loader')
             for source, target in loader:
                 mu, std, pi = model(source.cuda())
                 loss = criterion(target.cuda(), mu, std, pi)
-
-                optimizer.zero_grad()
-                loss.backward()
-                optimizer.step()
                 step += 1
+                (loss / hp.train.update_interval).backward()
+                loss_sum += loss.item() / hp.train.update_interval
+
+                if step % hp.train.update_interval == 0:
+                    optimizer.step()
+                    optimizer.zero_grad()
+                    if step % hp.log.summary_interval == 0:
+                        writer.log_training(loss_sum, mu, std, pi, step)
+                        loader.set_description("Loss %.04f at step %d" % (loss_sum, step))
+                        loss_sum = 0
 
                 loss = loss.item()
                 if loss > 1e8 or math.isnan(loss):
                     logger.error("Loss exploded to %.04f at step %d!" % (loss, step))
                     raise Exception("Loss exploded")
 
-                if step % hp.log.summary_interval == 0:
-                    writer.log_training(loss, mu, std, pi, step)
-                    loader.set_description("Loss %.04f at step %d" % (loss, step))
-
             save_path = os.path.join(pt_dir, '%s_%s_tier%d_%03d.pt'
                                      % (args.name, githash, args.tier, epoch))
             torch.save({
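
The rewritten loop is standard gradient accumulation: every mini-batch loss is scaled by 1 / update_interval before backward(), so after update_interval backward passes the accumulated gradient equals the gradient of the averaged loss, and only then are optimizer.step() and zero_grad() called. A self-contained sketch of the same pattern on a dummy model (the model, data and interval value are placeholders, not the project's):

import torch
import torch.nn as nn

model = nn.Linear(10, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)
criterion = nn.MSELoss()
update_interval = 4                      # accumulate 4 mini-batches per step

optimizer.zero_grad()
loss_sum = 0.0
for step in range(1, 17):                # 16 mini-batches -> 4 optimizer steps
    x, y = torch.randn(8, 10), torch.randn(8, 1)
    loss = criterion(model(x), y)
    (loss / update_interval).backward()  # scale so gradients average rather than sum
    loss_sum += loss.item() / update_interval

    if step % update_interval == 0:
        optimizer.step()                 # one update per accumulated "big batch"
        optimizer.zero_grad()
        print("step %d, accumulated loss %.4f" % (step, loss_sum))
        loss_sum = 0.0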

utils/utils.py
Lines changed: 2 additions & 2 deletions

@@ -13,7 +13,7 @@ def read_wav_np(wavpath):
     file_format = wavpath.split('.')[-1]
     audio = AudioSegment.from_file(wavpath, file_format)
     data = audio.raw_data
-    wav = np.frombuffer(data, dtype=np.uint8)
+    wav = np.frombuffer(data, dtype=np.int16)
 
     if len(wav.shape) == 2:
         wav = wav[:, 0]
@@ -37,6 +37,6 @@ def cut_wav(L, wav):
                      'constant', constant_values=0.0)
     else:
         start = random.randint(0, samples - L)
-        wav = wav[start:start+L]
+        wav = wav[start:start + L]
 
     return wav
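
The dtype fix reflects that pydub's AudioSegment.raw_data is 16-bit signed PCM for m4a (and most other) inputs, so np.frombuffer must read it as np.int16; decoding it as uint8 produced garbage amplitudes. A small sketch of turning an AudioSegment into a normalized float array, checking the sample width instead of assuming it (the project's read_wav_np may do less):

import numpy as np
from pydub import AudioSegment

def audiosegment_to_float(path):
    """Decode any pydub-supported file to mono float32 samples in [-1, 1]."""
    audio = AudioSegment.from_file(path, path.split('.')[-1])
    assert audio.sample_width == 2, "expected 16-bit PCM (2 bytes per sample)"
    wav = np.frombuffer(audio.raw_data, dtype=np.int16)
    if audio.channels > 1:
        wav = wav.reshape(-1, audio.channels)[:, 0]   # keep the first channel
    return wav.astype(np.float32) / 32768.0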

utils/validation.py
Lines changed: 1 addition & 1 deletion

@@ -22,7 +22,7 @@ def validate(args, model, melgen, tierutil, testloader, criterion, writer, step)
     test_loss /= len(testloader.dataset)
     source = source[0].cpu().detach().numpy()
     target = target[0].cpu().detach().numpy()
-    result = sample_gmm(mu, std, pi)[0].cpu().detach().numpy()
+    result = sample_gmm(mu[0], std[0], pi[0]).cpu().detach().numpy()
     writer.log_validation(test_loss, source, target, result, step)
 
     model.train()
