Merge branch 'main' into zeineldeen_att_decoder

mmz33 · mmz33 · commit 72ae8ac98669 · 2023-10-16T18:32:47.000+02:00
diff --git a/i6_models/primitives/feature_extraction.py b/i6_models/primitives/feature_extraction.py
@@ -37,7 +37,7 @@ class LogMelFeatureExtractionV1Config(ModelConfiguration):
     def __post_init__(self) -> None:
         super().__post_init__()
         assert self.f_max <= self.sample_rate // 2, "f_max can not be larger than half of the sample rate"
-        assert self.f_min > 0 and self.f_max > 0 and self.sample_rate > 0, "frequencies need to be positive"
+        assert self.f_min >= 0 and self.f_max > 0 and self.sample_rate > 0, "frequencies need to be positive"
         assert self.win_size > 0 and self.hop_size > 0, "window settings need to be positive"
         assert self.num_filters > 0, "number of filters needs to be positive"
         assert self.hop_size <= self.win_size, "using a larger hop size than window size does not make sense"
@@ -57,23 +57,25 @@ class LogMelFeatureExtractionV1(nn.Module):
 
     def __init__(self, cfg: LogMelFeatureExtractionV1Config):
         super().__init__()
-        self.register_buffer("n_fft", torch.tensor(cfg.n_fft))
-        self.register_buffer("window", torch.hann_window(int(cfg.win_size * cfg.sample_rate)))
-        self.register_buffer("hop_length", torch.tensor(int(cfg.hop_size * cfg.sample_rate)))
-        self.register_buffer("min_amp", torch.tensor(cfg.min_amp))
         self.center = cfg.center
+        self.hop_length = int(cfg.hop_size * cfg.sample_rate)
+        self.min_amp = cfg.min_amp
+        self.n_fft = cfg.n_fft
+        self.win_length = int(cfg.win_size * cfg.sample_rate)
+
         self.register_buffer(
             "mel_basis",
             torch.tensor(
                 filters.mel(
                     sr=cfg.sample_rate,
-                    n_fft=int(cfg.sample_rate * cfg.win_size),
+                    n_fft=cfg.n_fft,
                     n_mels=cfg.num_filters,
                     fmin=cfg.f_min,
                     fmax=cfg.f_max,
                 )
             ),
         )
+        self.register_buffer("window", torch.hann_window(self.win_length))
 
     def forward(self, raw_audio, length) -> Tuple[torch.Tensor, torch.Tensor]:
         """
@@ -87,6 +89,7 @@ def forward(self, raw_audio, length) -> Tuple[torch.Tensor, torch.Tensor]:
                     raw_audio,
                     n_fft=self.n_fft,
                     hop_length=self.hop_length,
+                    win_length=self.win_length,
                     window=self.window,
                     center=self.center,
                     pad_mode="constant",
@@ -99,7 +102,7 @@ def forward(self, raw_audio, length) -> Tuple[torch.Tensor, torch.Tensor]:
             # For some reason torch.stft removes the batch axis for batch sizes of 1, so we need to add it again
             power_spectrum = torch.unsqueeze(power_spectrum, 0)
         melspec = torch.einsum("...ft,mf->...mt", power_spectrum, self.mel_basis)
-        log_melspec = torch.log10(torch.max(self.min_amp, melspec))
+        log_melspec = torch.log10(torch.clamp(melspec, min=self.min_amp))
         feature_data = torch.transpose(log_melspec, 1, 2)
 
         if self.center:
diff --git a/i6_models/primitives/specaugment.py b/i6_models/primitives/specaugment.py
@@ -0,0 +1,138 @@
+import numpy as np
+import torch
+
+
+def _mask(tensor: torch.Tensor, batch_axis: int, axis: int, pos: torch.Tensor, max_len: int) -> torch.Tensor:
+    """Return ``tensor`` with one span per batch entry along ``axis`` set to zero.
+    :param tensor: e.g. [B, ..., A, ...] but arbitrary axis order
+    :param batch_axis: index of the batch axis
+    :param axis: which axis A to mask
+    :param pos: at which positions along axis to start the mask (size [B])
+    :param max_len: maximum mask length, the length is drawn uniformly from [1, max_len] per batch entry
+    """
+    batch_dim_size = tensor.shape[batch_axis]
+    mask_dim_size = tensor.shape[axis]
+    mask_len = torch.randint(low=1, high=max_len + 1, size=(batch_dim_size,), dtype=torch.int32, device=tensor.device)  # [B], high is exclusive
+    end_pos = torch.min(pos + mask_len, torch.tensor([mask_dim_size] * batch_dim_size, device=tensor.device))  # [B], clipped to the axis size
+    idxs = torch.arange(0, mask_dim_size, device=tensor.device).unsqueeze(0)  # [1,dim]
+    pos_bc = pos.unsqueeze(1)  # [B,1]
+    end_pos_bc = end_pos.unsqueeze(1)  # [B,1]
+    mask = torch.logical_and(torch.greater_equal(idxs, pos_bc), torch.less(idxs, end_pos_bc))  # [B,dim], True inside [pos, end_pos)
+    if batch_axis > axis:
+        mask = mask.transpose(0, 1)  # [dim,B], so the mask axes follow the axis order of tensor
+    mask = torch.reshape(mask, shape=[tensor.shape[i] if i in (batch_axis, axis) else 1 for i in range(tensor.ndim)])  # size 1 on all other axes for broadcasting
+    tensor = torch.where(mask, 0.0, tensor)  # masked entries become zero, the rest is kept
+    return tensor
+
+
+def _random_mask(tensor: torch.Tensor, batch_axis: int, axis: int, min_num: int, max_num: int, max_len: int):
+    """
+    Mask tensor along axis using N in [min_num, max_num] masks of length [1, max_len]
+
+    :param tensor: e.g. [B, ..., A, ...] but arbitrary axis order
+    :param batch_axis: index of the batch axis
+    :param axis: which axis to mask
+    :param min_num: minimum number of masks
+    :param max_num: maximum number of masks, raised to min_num if it is smaller than min_num
+    :param max_len: maximum mask length, each mask length is drawn uniformly from [1, max_len]
+    """
+
+    batch_dim_size = tensor.shape[batch_axis]
+    if max_num < min_num:
+        max_num = min_num  # an inverted range collapses to exactly min_num masks
+    num_masks = torch.randint(min_num, max_num + 1, size=(batch_dim_size,), device="cpu")  # [B], number of masks per batch entry
+
+    max_num_masks = num_masks.max().item()  # upper bound for the loop below
+
+    z = torch.rand((batch_dim_size, tensor.shape[axis]), device=tensor.device)  # [B,dim]
+    _, indices = torch.topk(z, max_num_masks, dim=1)  # [B,max_num_masks], random start positions, distinct within each row
+
+    # Make num_masks broadcastable to shape of tensor for torch.where.
+    num_masks = torch.reshape(num_masks, [1] * batch_axis + [batch_dim_size] + [1] * (tensor.dim() - batch_axis - 1))
+
+    num_masks = num_masks.to(device=tensor.device)
+
+    for i in range(max_num_masks):  # the i-th mask is only kept for batch entries that drew more than i masks
+        tensor = torch.where(i < num_masks, _mask(tensor, batch_axis, axis, indices[:, i], max_len), tensor)
+
+    return tensor
+
+
+def specaugment_v1(
+    audio_features: torch.Tensor,
+    *,
+    time_min_num_masks: int,
+    time_max_num_masks: int,
+    time_mask_max_size: int,
+    freq_min_num_masks: int,
+    freq_max_num_masks: int,
+    freq_mask_max_size: int,
+):
+    """
+    Specaugment from legacy rossenbach/zeineldeen/zeyer attention setups e.g.,
+    https://github.com/rwth-i6/i6_experiments/blob/main/users/zeineldeen/data_aug/specaugment/specaug_tf2.py
+    but without any step-based scheduling and without dependence on length.
+    See `specaugment_v1_by_length` for a variant which is closer to the original.
+
+    Fills masks with zeros.
+
+    Basically just a convenience wrapper around _random_mask.
+
+    See also: https://arxiv.org/abs/1904.08779
+
+    :param audio_features: e.g. log-mel features as [B, T, F]
+    :param time_min_num_masks: minimum number of masks along T
+    :param time_max_num_masks: maximum number of masks along T
+    :param time_mask_max_size: maximum size of masks along T
+    :param freq_min_num_masks: minimum number of masks along F
+    :param freq_max_num_masks: maximum number of masks along F
+    :param freq_mask_max_size: maximum size of masks along F
+    :return: masked audio features
+    """
+    assert len(audio_features.shape) == 3  # exactly three axes: 0 is used as batch, 1 as time, 2 as freq below
+    assert time_min_num_masks <= time_max_num_masks
+    assert freq_min_num_masks <= freq_max_num_masks
+    masked_audio_features = _random_mask(
+        audio_features, 0, 1, time_min_num_masks, time_max_num_masks, time_mask_max_size
+    )  # time masking
+    masked_audio_features = _random_mask(
+        masked_audio_features, 0, 2, freq_min_num_masks, freq_max_num_masks, freq_mask_max_size
+    )  # freq masking, applied on top of the time-masked features
+    return masked_audio_features
+
+
+def specaugment_v1_by_length(
+    audio_features: torch.Tensor,
+    *,
+    time_min_num_masks: int,
+    time_max_mask_per_n_frames: int,
+    time_mask_max_size: int,
+    freq_min_num_masks: int,
+    freq_max_num_masks: int,
+    freq_mask_max_size: int,
+):
+    """
+    Convenience wrapper around specaugment_v1 with time-length adaptive number of masks.
+
+    :param audio_features: e.g. log-mel features as [B, T, F]
+    :param time_max_mask_per_n_frames: used for the maximum number of time masks,
+        max_num_masks = max(T // time_max_mask_per_n_frames, time_min_num_masks),
+        where T = audio_features.size(1), i.e. the full batch length.
+        Shorter sequences might therefore get more masks than that by chance,
+        or none at all when all masks fall into the padding space.
+    :param time_min_num_masks: minimum number of masks along T
+    :param time_mask_max_size: maximum size of masks along T
+    :param freq_min_num_masks: minimum number of masks along F
+    :param freq_max_num_masks: maximum number of masks along F
+    :param freq_mask_max_size: maximum size of masks along F
+    :return: masked audio features
+    """
+    return specaugment_v1(
+        audio_features,
+        time_min_num_masks=time_min_num_masks,
+        time_max_num_masks=np.maximum(audio_features.size(1) // time_max_mask_per_n_frames, time_min_num_masks),
+        time_mask_max_size=time_mask_max_size,
+        freq_min_num_masks=freq_min_num_masks,
+        freq_max_num_masks=freq_max_num_masks,
+        freq_mask_max_size=freq_mask_max_size,
+    )
diff --git a/tests/test_blstm.py b/tests/test_blstm.py
@@ -33,7 +33,7 @@ def test_blstm_onnx_export():
                 "classes": {0: "batch", 1: "time"},
             },
         )
-        session = ort.InferenceSession(f.name)
+        session = ort.InferenceSession(f.name, providers=["CPUExecutionProvider"])
         outputs_onnx = torch.FloatTensor(
             session.run(None, {"data": dummy_data.numpy(), "data_len": dummy_data_len.numpy()})[0]
         )

Original file line number	Diff line number	Diff line change
`@@ -33,7 +33,7 @@ def test_blstm_onnx_export():`
`33`	`33`	`"classes": {0: "batch", 1: "time"},`
`34`	`34`	`},`
`35`	`35`	`)`
`36`		`- session = ort.InferenceSession(f.name)`
	`36`	`+ session = ort.InferenceSession(f.name, providers=["CPUExecutionProvider"])`
`37`	`37`	`outputs_onnx = torch.FloatTensor(`
`38`	`38`	`session.run(None, {"data": dummy_data.numpy(), "data_len": dummy_data_len.numpy()})[0]`
`39`	`39`	`)`