rwth-i6
diff --git a/‎i6_models/parts/frontend/README.md‎
Lines changed: 12 additions & 0 deletions b/‎i6_models/parts/frontend/README.md‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎i6_models/parts/frontend/__init__.py‎ b/‎i6_models/parts/frontend/__init__.py‎
diff --git a/‎i6_models/parts/frontend/common.py‎
Lines changed: 48 additions & 0 deletions b/‎i6_models/parts/frontend/common.py‎
Lines changed: 48 additions & 0 deletions
diff --git a/‎i6_models/parts/frontend/vgg_act.py‎
Lines changed: 266 additions & 0 deletions b/‎i6_models/parts/frontend/vgg_act.py‎
Lines changed: 266 additions & 0 deletions
@@ -0,0 +1,12 @@
+# Different front-ends for acoustic encoders
+
+### Contributing
+
+If you want to add your own front-end:
+
+- Normally two classes are required: a config class and a model class.
+- `Config` class inherits from `ModelConfiguration`
+- `Model` class inherits from `nn.Module` from `torch`
+- `forward(tensor: torch.Tensor, sequence_mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]`
+- `sequence_mask` is a boolean tensor where `True` marks positions inside the sequence and `False` marks masked positions.
+- Please add tests
@@ -0,0 +1,48 @@
+from typing import Tuple, Union
+
+import torch
+from torch import nn
+from torch.nn import functional
+
+
+def get_same_padding(input_size: Union[int, Tuple[int, ...]]) -> Union[int, Tuple[int, ...]]:
+    """
+    Get the padding that keeps a dimension from shrinking, computed as ``(size - 1) // 2``.
+
+    For an odd kernel size ``k`` this satisfies ``2 * padding == k - 1``.
+
+    :param input_size: kernel size, either a single int or a tuple with one entry per dimension
+    :return: padding in the same form as the input (int for int, tuple for tuple)
+    :raises TypeError: if ``input_size`` is neither an int nor a tuple
+    """
+
+    def _half_below(size):
+        return (size - 1) // 2
+
+    if isinstance(input_size, int):
+        return _half_below(input_size)
+    if not isinstance(input_size, tuple):
+        raise TypeError(f"unexpected size type {type(input_size)}")
+    return tuple(_half_below(size) for size in input_size)
+
+
+def mask_pool(seq_mask: torch.Tensor, *, kernel_size: int, stride: int, padding: int) -> torch.Tensor:
+    """
+    Downsample a sequence mask the same way a conv/pool layer with these settings changes the length.
+
+    The mask is max-pooled, so an output position is True as soon as any position
+    inside its window is True.
+
+    :param seq_mask: [B,T]
+    :param kernel_size: window size along T
+    :param stride: step between windows along T
+    :param padding: padding applied on both sides of T
+    :return: [B,T'] using maxpool; the input object itself if the settings keep the length unchanged
+    """
+    length_preserving = stride == 1 and 2 * padding == kernel_size - 1
+    if length_preserving:
+        # nothing to do, hand back the very same tensor
+        return seq_mask
+
+    # max_pool1d wants a float tensor with a channel axis: [B,T] -> [B,1,T]
+    pooled = nn.functional.max_pool1d(seq_mask.float().unsqueeze(1), kernel_size, stride, padding)  # [B,1,T']
+    return pooled.squeeze(1).bool()  # [B,T']
+
+
+def calculate_output_dim(in_dim: int, *, filter_size: int, stride: int, padding: int) -> int:
+    """
+    Compute the size of one dimension after a layer with the given filter size, stride and padding.
+
+    The result is ``ceil((in_dim + 2 * padding - (filter_size - 1)) / stride)``.
+
+    :param in_dim: size of the dimension before the layer
+    :param filter_size: kernel size along that dimension
+    :param stride: stride along that dimension
+    :param padding: padding applied on both sides of that dimension
+    :return: size of the dimension after the layer
+    """
+    covered = in_dim + 2 * padding - (filter_size - 1) * 1
+    # ceiling division written with floor division on the negated numerator
+    return -(-covered // stride)
@@ -0,0 +1,266 @@
+from __future__ import annotations
+
+__all__ = [
+    "VGG4LayerActFrontendV1",
+    "VGG4LayerActFrontendV1Config",
+]
+
+from dataclasses import dataclass
+from typing import Callable, Optional, Tuple, Union
+
+import torch
+from torch import nn
+
+from i6_models.config import ModelConfiguration
+
+from .common import get_same_padding, mask_pool, calculate_output_dim
+
+
+@dataclass
+class VGG4LayerActFrontendV1Config(ModelConfiguration):
+    """
+    Configuration for VGG4LayerActFrontendV1.
+
+    Attributes:
+        in_features: number of input features to module
+        conv1_channels: number of channels for first conv layer
+        conv2_channels: number of channels for second conv layer
+        conv3_channels: number of channels for third conv layer
+        conv4_channels: number of channels for fourth conv layer
+        conv_kernel_size: kernel size of conv layers (shared by all four)
+        conv_padding: padding for the convolution (shared by all four);
+            if None, the model uses get_same_padding(conv_kernel_size)
+        pool1_kernel_size: kernel size of first pooling layer
+        pool1_stride: stride of first pooling layer; passed to nn.MaxPool2d as is,
+            so None selects that layer's default stride
+        pool1_padding: padding for first pooling layer; if None, the model uses (0, 0)
+        pool2_kernel_size: kernel size of second pooling layer
+        pool2_stride: stride of second pooling layer; passed to nn.MaxPool2d as is,
+            so None selects that layer's default stride
+        pool2_padding: padding for second pooling layer; if None, the model uses (0, 0)
+        activation: activation function applied after the second and after the fourth
+            conv layer, i.e. right before each pooling layer
+        out_features: output size of the final linear layer
+    """
+
+    in_features: int
+    conv1_channels: int
+    conv2_channels: int
+    conv3_channels: int
+    conv4_channels: int
+    conv_kernel_size: Tuple[int, int]
+    conv_padding: Optional[Tuple[int, int]]
+    pool1_kernel_size: Tuple[int, int]
+    pool1_stride: Optional[Tuple[int, int]]
+    pool1_padding: Optional[Tuple[int, int]]
+    pool2_kernel_size: Tuple[int, int]
+    pool2_stride: Optional[Tuple[int, int]]
+    pool2_padding: Optional[Tuple[int, int]]
+    activation: Union[nn.Module, Callable[[torch.Tensor], torch.Tensor]]
+    out_features: int
+
+    def check_valid(self):
+        """
+        Assert that every kernel size given as a plain int is odd.
+
+        Only int values are inspected. Tuple kernel sizes (the annotated type) are
+        not checked here, so even pooling kernels given as tuples stay valid.
+
+        :raises AssertionError: if an int kernel size is even
+        """
+        for kernel_size in (self.conv_kernel_size, self.pool1_kernel_size, self.pool2_kernel_size):
+            if isinstance(kernel_size, int):
+                assert kernel_size % 2 == 1, "VGG4LayerActFrontendV1 only supports odd kernel sizes"
+
+    def __post_init__(self):
+        """
+        Run the base-class post-init hook, then validate this config.
+        """
+        # dataclasses only call a hook named exactly __post_init__
+        super().__post_init__()
+        self.check_valid()
+
+
+class VGG4LayerActFrontendV1(nn.Module):
+    """
+    Convolutional Front-End
+
+    The front-end utilizes convolutional and pooling layers, as well as activation functions
+    to transform a feature vector, typically Log-Mel or Gammatone for audio, into an intermediate
+    representation.
+
+    Structure of the front-end:
+      - Conv
+      - Conv
+      - Activation
+      - Pool
+      - Conv
+      - Conv
+      - Activation
+      - Pool
+      - Linear
+
+    Uses explicit padding for ONNX exportability, see:
+    https://github.com/pytorch/pytorch/issues/68880
+    """
+
+    def __init__(self, model_cfg: VGG4LayerActFrontendV1Config):
+        """
+        :param model_cfg: model configuration for this module
+        """
+        super().__init__()
+
+        model_cfg.check_valid()
+        self.cfg = model_cfg
+
+        # resolve unset paddings: "same" padding for the convs, no padding for the pools
+        if model_cfg.conv_padding is None:
+            conv_padding = get_same_padding(model_cfg.conv_kernel_size)
+        else:
+            conv_padding = model_cfg.conv_padding
+        pool1_padding = (0, 0) if model_cfg.pool1_padding is None else model_cfg.pool1_padding
+        pool2_padding = (0, 0) if model_cfg.pool2_padding is None else model_cfg.pool2_padding
+
+        # all four convolutions share kernel size and padding, only the channels differ
+        shared_conv_args = {"kernel_size": model_cfg.conv_kernel_size, "padding": conv_padding}
+
+        self.conv1 = nn.Conv2d(in_channels=1, out_channels=model_cfg.conv1_channels, **shared_conv_args)
+        self.conv2 = nn.Conv2d(
+            in_channels=model_cfg.conv1_channels, out_channels=model_cfg.conv2_channels, **shared_conv_args
+        )
+        self.pool1 = nn.MaxPool2d(
+            kernel_size=model_cfg.pool1_kernel_size, stride=model_cfg.pool1_stride, padding=pool1_padding
+        )
+        self.conv3 = nn.Conv2d(
+            in_channels=model_cfg.conv2_channels, out_channels=model_cfg.conv3_channels, **shared_conv_args
+        )
+        self.conv4 = nn.Conv2d(
+            in_channels=model_cfg.conv3_channels, out_channels=model_cfg.conv4_channels, **shared_conv_args
+        )
+        self.pool2 = nn.MaxPool2d(
+            kernel_size=model_cfg.pool2_kernel_size, stride=model_cfg.pool2_stride, padding=pool2_padding
+        )
+        self.activation = model_cfg.activation
+        # the linear layer consumes the flattened channel and feature axes of the last pooling output
+        self.linear = nn.Linear(in_features=self._calculate_dim(), out_features=model_cfg.out_features, bias=True)
+
+    def forward(self, tensor: torch.Tensor, sequence_mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        T might be reduced to T' or T'' depending on stride of the layers
+
+        stride is only configurable for the pool1 and pool2 operation.
+        The conv layers keep their default stride, but the mask is still passed through
+        mask_pool after every layer so it follows any change of the time axis.
+
+        :param tensor: input tensor of shape [B,T,F]
+        :param sequence_mask: the sequence mask for the tensor
+        :return: torch.Tensor of shape [B,T",F'] and the sequence mask reduced to the same T"
+        """
+        assert tensor.shape[-1] == self.cfg.in_features
+        # insert the channel axis expected by Conv2d
+        tensor = tensor[:, None, :, :]  # [B,C=1,T,F]
+
+        # first block: conv -> conv -> activation -> pool
+        tensor = self.conv1(tensor)
+        sequence_mask = mask_pool(
+            seq_mask=sequence_mask,
+            kernel_size=self.conv1.kernel_size[0],
+            stride=self.conv1.stride[0],
+            padding=self.conv1.padding[0],
+        )
+        tensor = self.conv2(tensor)
+        sequence_mask = mask_pool(
+            seq_mask=sequence_mask,
+            kernel_size=self.conv2.kernel_size[0],
+            stride=self.conv2.stride[0],
+            padding=self.conv2.padding[0],
+        )
+        tensor = self.pool1(self.activation(tensor))  # [B,C,T',F']
+        sequence_mask = mask_pool(
+            seq_mask=sequence_mask,
+            kernel_size=self.pool1.kernel_size[0],
+            stride=self.pool1.stride[0],
+            padding=self.pool1.padding[0],
+        )
+
+        # second block: conv -> conv -> activation -> pool
+        tensor = self.conv3(tensor)
+        sequence_mask = mask_pool(
+            seq_mask=sequence_mask,
+            kernel_size=self.conv3.kernel_size[0],
+            stride=self.conv3.stride[0],
+            padding=self.conv3.padding[0],
+        )
+        tensor = self.conv4(tensor)
+        sequence_mask = mask_pool(
+            seq_mask=sequence_mask,
+            kernel_size=self.conv4.kernel_size[0],
+            stride=self.conv4.stride[0],
+            padding=self.conv4.padding[0],
+        )
+        tensor = self.pool2(self.activation(tensor))  # [B,C,T",F"]
+        sequence_mask = mask_pool(
+            seq_mask=sequence_mask,
+            kernel_size=self.pool2.kernel_size[0],
+            stride=self.pool2.stride[0],
+            padding=self.pool2.padding[0],
+        )
+
+        # merge channel and feature axes per time step, then project
+        tensor = tensor.transpose(1, 2)  # [B,T",C,F"]
+        tensor = tensor.flatten(start_dim=2, end_dim=-1)  # [B,T",C*F"]
+        projected = self.linear(tensor)
+
+        return projected, sequence_mask
+
+    def _calculate_dim(self) -> int:
+        """
+        Compute the input size of the final linear layer.
+
+        Follows the feature axis (index 1 of kernel size, stride and padding) through
+        every conv and pooling layer, then multiplies by the channels of the last conv.
+
+        :return: C*F" of the flattened output of the second pooling layer
+        """
+        feature_dim = self.cfg.in_features
+        # same layer order as in forward
+        for layer in (self.conv1, self.conv2, self.pool1, self.conv3, self.conv4, self.pool2):
+            feature_dim = calculate_output_dim(
+                in_dim=feature_dim,
+                filter_size=layer.kernel_size[1],
+                stride=layer.stride[1],
+                padding=layer.padding[1],
+            )
+        return feature_dim * self.conv4.out_channels