🌱 Add duration to mask exporter, modify Tacotron2 and dataloader to accept

ZDisket · ZDisket · commit 221f1cd226a0 · 2021-09-16T00:23:11.000-03:00
diff --git a/examples/tacotron2/export_align.py b/examples/tacotron2/export_align.py
@@ -0,0 +1,169 @@
+import os
+import shutil
+from tqdm import tqdm
+import argparse
+
+from scipy.ndimage import zoom
+from skimage.data import camera
+import numpy as np
+from scipy.spatial.distance import cdist
+
+def safemkdir(dirn):
+    """Create directory ``dirn`` (and missing parents); no-op if it already exists."""
+    os.makedirs(dirn, exist_ok=True)
+        
+from pathlib import Path
+
+def duration_to_alignment(in_duration):
+    """Expand per-token durations into a hard 0/1 alignment matrix.
+
+    Row ``i`` holds 1.0 on the ``in_duration[i]`` consecutive frames owned by
+    entry ``i`` and 0.0 elsewhere; the result is float32 with shape
+    ``(len(in_duration), sum(in_duration))``.
+    """
+    n_frames = np.sum(in_duration)
+    align = np.zeros(shape=(len(in_duration), n_frames), dtype=np.float32)
+    start = 0
+    for row, span in enumerate(in_duration):
+        align[row, start:start + span] = 1.0
+        start += span
+    return align
+
+
+def rescale_alignment(in_alignment,in_targcharlen):
+    """Stretch an alignment along axis 0 and re-binarize it.
+
+    Args:
+        in_alignment: 2-D array; axis 0 is resampled, axis 1 is left as is.
+        in_targcharlen: Desired size of axis 0 after rescaling.
+
+    Returns:
+        Tuple ``(zoomed, pivot_points)``: the rescaled array with every cell
+        snapped to 0.0 or 1.0, and the ``(x, y)`` index of each cell that was
+        snapped to 1.0, in row-major order.
+    """
+    row_scale = in_targcharlen / in_alignment.shape[0]
+    resampled = zoom(in_alignment, (row_scale, 1.0), mode="nearest")
+
+    # Cells below 0.5 snap to 0.0; every other cell snaps to 1.0 and is a pivot.
+    is_low = resampled < 0.5
+    pivot_points = [(x_v, y_v) for x_v, y_v in np.argwhere(~is_low).tolist()]
+    zoomed = np.where(is_low, 0.0, 1.0).astype(resampled.dtype)
+
+    if zoomed.shape[0] != in_targcharlen:
+        print("Zooming didn't rshape well, explicitly reshaping")
+        zoomed.resize((in_targcharlen,in_alignment.shape[1]))
+    return zoomed, pivot_points
+
+
+def gather_dist(in_mtr,in_points):
+    """Return Euclidean distances from every cell of ``in_mtr`` to ``in_points``.
+
+    Row ``k`` of the result belongs to cell ``(k // cols, k % cols)`` of
+    ``in_mtr`` (row-major order); column ``j`` belongs to ``in_points[j]``.
+    """
+    n_rows, n_cols = in_mtr.shape[0], in_mtr.shape[1]
+    # Enumerate the (x, y) index of each cell in row-major order.
+    cell_coords = [(x, y) for x in range(n_rows) for y in range(n_cols)]
+    return cdist(cell_coords, in_points,"euclidean")
+        
+        
+
+
+def create_guided(in_align,in_pvt,looseness):
+    """Build a soft guided-attention mask around a set of pivot cells.
+
+    Each cell receives ``(d / scale) ** 2`` clipped to ``[0, 1]``, where ``d``
+    is the Euclidean distance to its nearest pivot and
+    ``scale = looseness / 100 * (rows + cols)``.
+
+    Args:
+        in_align: 2-D array; only its shape is used.
+        in_pvt: Sequence of ``(x, y)`` pivot indices.
+        looseness: Mask width in percent of ``rows + cols``; lower is tighter.
+
+    Returns:
+        float32 array shaped like ``in_align``; all ones if ``in_pvt`` is empty.
+    """
+    if len(in_pvt) == 0:
+        # cdist cannot handle an empty point set; keep the all-ones default.
+        return np.ones(in_align.shape,dtype=np.float32)
+    # Scale by rows + cols (addition works better than mul); / 100 for nicer input.
+    real_loose = (looseness / 100) * (in_align.shape[0] + in_align.shape[1])
+    nearest = gather_dist(in_align,in_pvt).min(axis=1) / real_loose
+    return np.clip(np.square(nearest).reshape(in_align.shape).astype(np.float32),0.0,1.0)
+
+def get_pivot_points(in_att):
+    """List the ``(x, y)`` index of every cell of ``in_att`` greater than 0.8.
+
+    Indices are returned as tuples in row-major order.
+    """
+    hits = np.argwhere(in_att > 0.8).tolist()
+    return [(x, y) for x, y in hits]
+
+def main():
+    """Turn every dumped duration file into a guided-attention alignment file.
+
+    For the ``train`` and ``valid`` folders under ``--dump-dir``, each file in
+    ``durations`` whose name contains ``.npy`` is paired with its ``-ids``
+    counterpart in ``ids``, converted to a guided mask and saved under
+    ``alignments`` with the ``-alignment`` suffix.
+    """
+    cli = argparse.ArgumentParser(description="Postprocess durations to become alignments")
+    cli.add_argument(
+      "--dump-dir",
+      default="dump",
+      type=str,
+      help="Path of dump directory",
+    )
+    cli.add_argument(
+      "--looseness",
+      default=3.5,
+      type=float,
+      help="Looseness of the generated guided attention map. Lower values = tighter",
+    )
+    args = cli.parse_args()
+
+    # Both dataset splits are processed the same way.
+    for subset in ("train", "valid"):
+        subset_dir = os.path.join(args.dump_dir, subset)
+        ids_dir = os.path.join(subset_dir, "ids")
+        durations_dir = os.path.join(subset_dir, "durations")
+        out_dir = os.path.join(subset_dir, "alignments")
+
+        safemkdir(out_dir)
+
+        for duration_fn in tqdm(os.listdir(durations_dir)):
+            # Skip anything that is not a numpy dump.
+            if ".npy" not in duration_fn:
+                continue
+
+            # The ids file shares the duration file's name apart from the suffix.
+            id_fn = duration_fn.replace("-durations", "-ids")
+            durations = np.load(os.path.join(durations_dir, duration_fn))
+            n_ids = len(np.load(os.path.join(ids_dir, id_fn)))
+
+            align = duration_to_alignment(durations)
+
+            # The duration count may differ from the id count; stretch to match.
+            if align.shape[0] == n_ids:
+                points = get_pivot_points(align)
+            else:
+                align, points = rescale_alignment(align, n_ids)
+
+            if len(points) == 0:
+                print("WARNING points are empty for",id_fn)
+
+            align = create_guided(align, points, args.looseness)
+
+            out_fn = id_fn.replace("-ids", "-alignment")
+            out_path = os.path.join(out_dir, out_fn)
+            np.save(out_path, align.astype("float32"))
+        
+
+
+
+
+if __name__ == "__main__":
+    main()
+
diff --git a/examples/tacotron2/tacotron_dataset.py b/examples/tacotron2/tacotron_dataset.py
@@ -35,6 +35,7 @@ def __init__(
         root_dir,
         charactor_query="*-ids.npy",
         mel_query="*-norm-feats.npy",
+        align_query="",
         charactor_load_fn=np.load,
         mel_load_fn=np.load,
         mel_length_threshold=0,
@@ -52,6 +53,7 @@ def __init__(
             charactor_query (str): Query to find charactor files in root_dir.
             mel_query (str): Query to find feature files in root_dir.
             charactor_load_fn (func): Function to load charactor file.
+            align_query (str): Query to find FAL files in root_dir. If empty, we use stock guided attention loss
             mel_load_fn (func): Function to load feature file.
             mel_length_threshold (int): Threshold to remove short feature files.
             reduction_factor (int): Reduction factor on Tacotron-2 paper.
@@ -67,6 +69,8 @@ def __init__(
         # find all of charactor and mel files.
         charactor_files = sorted(find_files(root_dir, charactor_query))
         mel_files = sorted(find_files(root_dir, mel_query))
+
+
         mel_lengths = [mel_load_fn(f).shape[0] for f in mel_files]
         char_lengths = [charactor_load_fn(f).shape[0] for f in charactor_files]
 
@@ -76,6 +80,16 @@ def __init__(
             len(mel_files) == len(charactor_files) == len(mel_lengths)
         ), f"Number of charactor, mel and duration files are different \
                 ({len(mel_files)} vs {len(charactor_files)} vs {len(mel_lengths)})."
+        
+        self.align_files = []
+
+        if len(align_query) > 1:
+          align_files = sorted(find_files(root_dir, align_query))
+          assert len(align_files) == len(mel_files),f"Number of align files ({len(align_files)}) and mel files ({len(mel_files)}) are different"
+          logging.info("Using FAL loss")
+          self.align_files = align_files
+        else:
+          logging.info("Using guided attention loss")
 
         if ".npy" in charactor_query:
             suffix = charactor_query[1:]
@@ -114,11 +128,13 @@ def generator(self, utt_ids):
         for i, utt_id in enumerate(utt_ids):
             mel_file = self.mel_files[i]
             charactor_file = self.charactor_files[i]
+            align_file = self.align_files[i] if len(self.align_files) > 1 else "" 
 
             items = {
                 "utt_ids": utt_id,
                 "mel_files": mel_file,
                 "charactor_files": charactor_file,
+                "align_files": align_file,
             }
 
             yield items
@@ -127,6 +143,8 @@ def generator(self, utt_ids):
     def _load_data(self, items):
         mel = tf.numpy_function(np.load, [items["mel_files"]], tf.float32)
         charactor = tf.numpy_function(np.load, [items["charactor_files"]], tf.int32)
+        g_att = tf.numpy_function(np.load, [items["align_files"]], tf.float32) if len(self.align_files) > 1 else None 
+        
         mel_length = len(mel)
         char_length = len(charactor)
         # padding mel to make its length is multiple of reduction factor.
@@ -149,6 +167,7 @@ def _load_data(self, items):
             "mel_gts": mel,
             "mel_lengths": mel_length,
             "real_mel_lengths": real_mel_length,
+            "g_attentions": g_att,
         }
 
         return items
@@ -187,10 +206,14 @@ def create(
         )
 
         # calculate guided attention
-        datasets = datasets.map(
-            lambda items: self._guided_attention(items),
-            tf.data.experimental.AUTOTUNE
-        )
+        if len(self.align_files) < 1:
+            datasets = datasets.map(
+                lambda items: self._guided_attention(items),
+                tf.data.experimental.AUTOTUNE
+            )
+        
+            
+          
 
         datasets = datasets.filter(
             lambda x: x["mel_lengths"] > self.mel_length_threshold
@@ -249,6 +272,7 @@ def get_output_dtypes(self):
             "utt_ids": tf.string,
             "mel_files": tf.string,
             "charactor_files": tf.string,
+            "align_files": tf.string,
         }
         return output_types
 
diff --git a/examples/tacotron2/train_tacotron2.py b/examples/tacotron2/train_tacotron2.py
@@ -336,6 +336,9 @@ def main():
         nargs="?",
         help="pretrained weights .h5 file to load weights from. Auto-skips non-matching layers",
     )
+    parser.add_argument(
+        "--use-fal", default=0, type=int, help="Use forced alignment guided attention loss or regular"
+    )
     args = parser.parse_args()
 
     # return strategy
@@ -347,6 +350,7 @@ def main():
 
     args.mixed_precision = bool(args.mixed_precision)
     args.use_norm = bool(args.use_norm)
+    args.use_fal = bool(args.use_fal)
 
     # set logger
     if args.verbose > 1:
@@ -394,6 +398,7 @@ def main():
     if config["format"] == "npy":
         charactor_query = "*-ids.npy"
         mel_query = "*-raw-feats.npy" if args.use_norm is False else "*-norm-feats.npy"
+        align_query = "*-alignment.npy" if args.use_fal is True else ""
         charactor_load_fn = np.load
         mel_load_fn = np.load
     else:
@@ -409,6 +414,7 @@ def main():
         mel_length_threshold=mel_length_threshold,
         reduction_factor=config["tacotron2_params"]["reduction_factor"],
         use_fixed_shapes=config["use_fixed_shapes"],
+        align_query=align_query,
     )
 
     # update max_mel_length and max_char_length to config
@@ -438,6 +444,7 @@ def main():
         mel_length_threshold=mel_length_threshold,
         reduction_factor=config["tacotron2_params"]["reduction_factor"],
         use_fixed_shapes=False,  # don't need apply fixed shape for evaluation.
+        align_query=align_query,
     ).create(
         is_shuffle=config["is_shuffle"],
         allow_cache=config["allow_cache"],