mask loss for padding; redo preprocessing and dataloading to correctly randomize order of concurrent events

victor-shepardson · victor-shepardson · commit 78c2ab9e98ad · 2022-04-15T12:15:37.000Z
diff --git a/notepredictor/notebook/midi.ipynb b/notepredictor/notebook/midi.ipynb
diff --git a/notepredictor/notepredictor/data.py b/notepredictor/notepredictor/data.py
@@ -7,7 +7,7 @@
 from torch.utils.data import Dataset, DataLoader
 
 class MIDIDataset(Dataset):
-    def __init__(self, data_dir, batch_len, transpose=2, speed=0.1, glob='**/*.pkl'):
+    def __init__(self, data_dir, batch_len, transpose=5, speed=0.1, glob='**/*.pkl'):
         #, clamp_time=(-,10)):
         """
         """
@@ -56,15 +56,13 @@ def __getitem__(self, idx):
         transpose = random.randint(-transpose_down, transpose_up)
         pitch = pitch + transpose
 
-        # random speed
-        # delta t of first note?
-        time = time.float()
+
+        time_margin = 1e-3 # hardcoded since it should match prep script
+
+        # dequantize: add noise up to +/- margin
+        time = time + (torch.rand_like(time)*2-1)*time_margin
+        # random augment tempo
         time = time * (1 + random.random()*self.speed*2 - self.speed)
-        # dequantize
-        # TODO: use actual tactus from MIDI file?
-        time = (
-            time + (torch.rand_like(time)-0.5)*2e-3
-            ).clamp(0., float('inf'))
 
         # dequantize velocity
         velocity = velocity.float()
@@ -77,8 +75,18 @@ def __getitem__(self, idx):
         velocity = velocity ** (2**(torch.randn((1,))/3))
         velocity *= 127
 
+        # sort (using argsort on time and indexing the rest)
+        # compute delta time
+        time, idx = time.sort()
+        time = torch.cat((time.new_zeros((1,)), time)).diff(1)
+        program = program[idx]
+        pitch = pitch[idx]
+        velocity = velocity[idx]
+
         # pad with start tokens, zeros
-        pad = max(0, self.batch_len-len(pitch))
+        # always pad with batch_len-1 so that end tokens don't appear in a biased
+        # location
+        pad = self.batch_len-1#max(0, self.batch_len-len(pitch))
         program = torch.cat((
             program.new_full((1,), self.prog_start_token),
             program,
@@ -95,13 +103,13 @@ def __getitem__(self, idx):
             velocity.new_zeros((1,)),
             velocity,
             velocity.new_zeros((pad,))))
-        # end signal: nonzero for last event + padding
+        # end signal: nonzero for last event (and the padding after it, which the loss mask excludes)
         end = torch.zeros_like(program)
         end[-pad-1:] = 1
-
-        mask = torch.zeros_like(program)
+        # compute binary mask for the loss
+        mask = torch.ones_like(program, dtype=torch.bool)
         if pad > 0:
-            mask[-pad:] = 1
+            mask[-pad:] = False
 
         # random slice
         i = random.randint(0, len(pitch)-self.batch_len)
diff --git a/notepredictor/notepredictor/model.py b/notepredictor/notepredictor/model.py
@@ -204,6 +204,11 @@ def __init__(self,
                 p.weight.mul_(1e-2)
             self.end_proj.weight.mul(1e-2)
 
+        # IDEA: instead of this, combine current embeddings (independently) with h via MLPs
+        # stacked along a new final dim
+        # matmul by mask, which is easier (?) to vary per batch/time
+        # (compared to permute-and-cumsum)
+        # then tanh, unbind and more independent MLPs -> dist params
         self.xformer = ModalityTransformer(emb_size, ar_hidden, ar_heads, ar_layers)
 
         # persistent RNN state for inference
@@ -255,14 +260,14 @@ def forward(self, instruments, pitches, times, velocities, ends, validation=Fals
             t.expand(self.rnn.num_layers, x.shape[0], -1).contiguous() # 1 x batch x hidden
             for t in self.initial_state)
         h, _ = self.rnn(x, initial_state) #batch, time, hidden_size
+        h = h[:,:-1] # skip last time position
 
         # fit all note factorizations (e.g. pitch->time->vel vs vel->time->pitch)
         # TODO: perm each batch item independently?
         # get a random ordering for note modalities:
         perm = torch.randperm(self.note_dim)
         # chunk RNN state into Transformer inputs
-        hs = self.h_proj(h[:,:-1]) # skip last time position
-        hs = list(hs.chunk(self.note_dim+1, -1))
+        hs = list(self.h_proj(h).chunk(self.note_dim+1, -1))
         h_ctx = hs[0]
         h_tgt = [hs[i+1] for i in perm]
         # embed ground truth values for teacher-forcing
@@ -294,9 +299,10 @@ def forward(self, instruments, pitches, times, velocities, ends, validation=Fals
         vel_log_probs = vel_result.pop('log_prob')
 
         # end prediction
+        # skip the last position for convenience (so masking is the same)
         end_params = self.end_proj(h)
         end_logits = F.log_softmax(end_params, -1)
-        end_log_probs = end_logits.gather(-1, ends[:,:,None])[...,0]
+        end_log_probs = end_logits.gather(-1, ends[:,:-1,None])[...,0]
 
         r = {
             'end_log_probs': end_log_probs,
diff --git a/notepredictor/scripts/lakh_prep_multitrack.py b/notepredictor/scripts/lakh_prep_multitrack.py
@@ -1,6 +1,7 @@
 from pathlib import Path
 from multiprocessing import Pool
 import functools as ft
+from collections import defaultdict
 import random
 
 from tqdm import tqdm
@@ -16,33 +17,68 @@ def process(fnames):
     except Exception:
         return
 
-    inst_events = []
+    # fix overlapping notes and add a margin for 
+    # dequantization at data loading time
+    time_margin = 1e-3
+
+    events = []
+    # for each instrument
     for inst in mid.instruments:
         inst.remove_invalid_notes()
         program = inst.program + 128*inst.is_drum
-        # NOTE: this will sort concurrent events by pitch
-        # which will introduce some bias when interacting with the model?
-        # e.g. if user plays a note, it will never be harmonized below (or only)
-        # with inexact timing, less frequently
-        # similarly the pitch order would correlate with instrument, i.e. bass 
-        # would usually play first
-        # if anything descending pitch might sound better
-        # could randomize -- even better would be to randomize in dataloader
-        # might be expensive though
-        note_ons = [(n.start, n.pitch, n.velocity, program) for n in inst.notes]
-        note_offs = [(n.end, n.pitch, 0, program) for n in inst.notes]
-        inst_events.extend(note_ons+note_offs)
-    if len(inst_events) < 64:
+        
+        # break out by pitch
+        nbp = defaultdict(list)
+        for n in inst.notes:
+            nbp[n.pitch].append(n)
+            
+        # shorten all notes so they end at least 2*time_margin before the next note (within pitch)
+        for seq in nbp.values():
+            for i,n in enumerate(seq[:-1]):
+                max_end = seq[i+1].start-2*time_margin
+                if n.end > max_end:
+                    n.end = max_end
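+                # and flatten again -- NOTE(review): this runs inside the seq[:-1] loop, so the last note of each pitch is never emitted; confirm intended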
+                # converting note offs to 0 velocity
+                # also prevent any note ons from having 0 velocity
+                events.append((n.start, n.pitch, max(1, n.velocity), program))
+                events.append((n.end, n.pitch, 0, program))
+
+    if len(events) < 64:
         return
-    time, pitch, vel, prog = zip(*sorted(inst_events))
-    delta = torch.FloatTensor([0, *time]).diff(1)
+
+    time, pitch, vel, prog = zip(*events)
     torch.save(dict(
-        time=delta, 
+        time=torch.FloatTensor(time), 
         pitch=torch.LongTensor(pitch), 
         velocity=torch.LongTensor(vel),
         program=torch.LongTensor(prog)
     ), g.with_suffix('.pkl'))
 
+
+    #     # NOTE: this will sort concurrent events by pitch
+    #     # which will introduce some bias when interacting with the model?
+    #     # e.g. if user plays a note, it will never be harmonized below (or only)
+    #     # with inexact timing, less frequently
+    #     # similarly the pitch order would correlate with instrument, i.e. bass 
+    #     # would usually play first
+    #     # if anything descending pitch might sound better
+    #     # could randomize -- even better would be to randomize in dataloader
+    #     # might be expensive though
+    #     note_ons = [(n.start, n.pitch, n.velocity, program) for n in inst.notes]
+    #     note_offs = [(n.end, n.pitch, 0, program) for n in inst.notes]
+    #     inst_events.extend(note_ons+note_offs)
+    # if len(inst_events) < 64:
+    #     return
+    # time, pitch, vel, prog = zip(*sorted(inst_events))
+    # delta = torch.FloatTensor([0, *time]).diff(1)
+    # torch.save(dict(
+    #     time=delta, 
+    #     pitch=torch.LongTensor(pitch), 
+    #     velocity=torch.LongTensor(vel),
+    #     program=torch.LongTensor(prog)
+    # ), g.with_suffix('.pkl'))
+
 def main(data_path, dest_path, n_jobs=4):
     data_dir = Path(data_path)
     files = list(data_dir.glob('**/*.mid'))
diff --git a/notepredictor/scripts/train_notes.py b/notepredictor/scripts/train_notes.py
@@ -23,7 +23,6 @@ def __init__(self,
         log_dir,
         data_dir,
         model = None, # dict of model constructor overrides
-        # clamp_time = (0,10), # given to trainer because it needs to go to dataset+model
         batch_size = 128,
         batch_len = 64,
         lr = 3e-4,
@@ -73,7 +72,7 @@ def __init__(self,
 
         # Trainer state
         self.iteration = 0
-        self.exposure = 0
+        self.exposure = 0 # TODO: measure in events, not batch items
         self.epoch = 0
 
         # construct model from arguments 
@@ -139,14 +138,15 @@ def process_grad(self):
                 self.model.parameters(), self.grad_clip, error_if_nonfinite=True)
         return r
 
-    def get_loss_components(self, result):
-        # TODO: masking
+    def get_loss_components(self, result, mask):
+        def reduce(k):
+            return result[k].masked_select(mask).mean()
         return {
-            'instrument_nll': -result['instrument_log_probs'].mean(),
-            'pitch_nll': -result['pitch_log_probs'].mean(),
-            'time_nll': -result['time_log_probs'].mean(),
-            'velocity_nll': -result['velocity_log_probs'].mean(),
-            'end_nll': -result['end_log_probs'].mean()
+            'instrument_nll': -reduce('instrument_log_probs'),
+            'pitch_nll': -reduce('pitch_log_probs'),
+            'time_nll': -reduce('time_log_probs'),
+            'velocity_nll': -reduce('velocity_log_probs'),
+            'end_nll': -reduce('end_log_probs'),
         }
 
     def train(self):
@@ -165,6 +165,7 @@ def validate():
             metrics = defaultdict(float)
             self.model.eval()
             for batch in tqdm(valid_loader, desc=f'validating epoch {self.epoch}'):
+                mask = batch['mask'].to(self.device, non_blocking=True)[...,1:]
                 end = batch['end'].to(self.device, non_blocking=True)
                 inst = batch['instrument'].to(self.device, non_blocking=True)
                 pitch = batch['pitch'].to(self.device, non_blocking=True)
@@ -173,18 +174,19 @@ def validate():
                 with torch.no_grad():
                     result = self.model(
                         inst, pitch, time, vel, end, validation=True)
-                    losses = {k:v.item() for k,v in self.get_loss_components(result).items()}
+                    losses = {k:v.item() for k,v in self.get_loss_components(
+                        result, mask).items()}
                     metrics['loss'] += sum(losses.values())
                     for k,v in losses.items():
                         metrics[k] += v
                     metrics['instrument_acc'] += (result['instrument_log_probs']
-                        .exp().mean().item())
+                        .masked_select(mask).exp().mean().item())
                     metrics['pitch_acc'] += (result['pitch_log_probs']
-                        .exp().mean().item())
+                        .masked_select(mask).exp().mean().item())
                     metrics['time_acc_30ms'] += (result['time_acc_30ms']
-                        .mean().item())
+                        .masked_select(mask).mean().item())
                     metrics['velocity_acc'] += (result['velocity_log_probs']
-                        .exp().mean().item())
+                        .masked_select(mask).exp().mean().item())
             self.log('valid', {k:v/len(valid_loader) for k,v in metrics.items()})
 
         epoch_size = self.epoch_size or len(train_loader)
@@ -199,29 +201,33 @@ def validate():
             self.model.train()
             for batch in tqdm(it.islice(train_loader, epoch_size), 
                     desc=f'training epoch {self.epoch}', total=epoch_size):
-
+                mask = batch['mask'].to(self.device, non_blocking=True)
                 end = batch['end'].to(self.device, non_blocking=True)
                 inst = batch['instrument'].to(self.device, non_blocking=True)
                 pitch = batch['pitch'].to(self.device, non_blocking=True)
                 time = batch['time'].to(self.device, non_blocking=True)
                 vel = batch['velocity'].to(self.device, non_blocking=True)
 
                 self.iteration += 1
-                self.exposure += self.batch_size
-
+                self.exposure += self.batch_size # * self.batch_len
                 logs = {}
 
+                ### forward+backward+optimizer step ###
                 self.opt.zero_grad()
                 result = self.model(inst, pitch, time, vel, end)
-                losses = self.get_loss_components(result)
+                losses = self.get_loss_components(result, mask[...,1:])
                 loss = sum(losses.values())
                 loss.backward()
                 logs |= self.process_grad()
                 self.opt.step()
+                ########
 
+                # log loss components
                 logs |= {k:v.item() for k,v in losses.items()}
-                logs |= {k:v.item() for k,v in result.items() if v.numel()==1}
+                # log total loss
                 logs |= {'loss':loss.item()}
+                # log any other returned scalars
+                logs |= {k:v.item() for k,v in result.items() if v.numel()==1}
                 self.log('train', logs)
 
             validate()