from dataclasses import dataclass
import torch
from torch import nn

from i6_models.config import ModelConfiguration


@dataclass
class BlstmEncoderV1Config(ModelConfiguration):
    """
    Attributes:
        num_layers: number of bi-directional LSTM layers, minimum 2
        input_dim: input dimension size
        hidden_dim: hidden dimension of one direction of the LSTM, the total output size is twice this value
        dropout: nn.LSTM applies this dropout internally between the BLSTM layers (but not on the input/output)
        enforce_sorted:
            True: expects that sequences are sorted by sequence length in decreasing order.
                Will not do any sorting.
                This is required for ONNX export, and thus the recommended setting.
            False: no expectation.
                It will internally enforce that they are sorted
                and undo the reordering at the output.

    Sorting can, for example, be performed independently of the ONNX export, e.g. in train_step:

        audio_features_len, indices = torch.sort(audio_features_len, descending=True)
        audio_features = audio_features[indices, :, :]
        labels = labels[indices, :]
        labels_len = labels_len[indices]
    """

    num_layers: int
    input_dim: int
    hidden_dim: int
    dropout: float
    enforce_sorted: bool

class BlstmEncoderV1(torch.nn.Module):
    """
    Simple multi-layer BLSTM model including dropout, batch-first variant,
    hardcoded to use B,T,F input

    supports: TorchScript, ONNX export
    """

    def __init__(self, config: BlstmEncoderV1Config):
        """
        :param config: configuration object
        """
        super().__init__()
        self.dropout = config.dropout
        self.enforce_sorted = config.enforce_sorted
        self.blstm_stack = nn.LSTM(
            input_size=config.input_dim,
            hidden_size=config.hidden_dim,
            bidirectional=True,
            num_layers=config.num_layers,
            batch_first=True,
            dropout=self.dropout,
        )

    def forward(self, x: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor:
        """
        :param x: [B, T, input_dim]
        :param seq_len: [B], should be on CPU for Script/Trace mode
        :return: [B, T, 2 * hidden_dim]
        """
        if not torch.jit.is_scripting() and not torch.jit.is_tracing():
            # in graph mode (scripting/tracing) we have to assume all tensors are on the correct device,
            # otherwise move the lengths to the CPU if they are on GPU
            if seq_len.get_device() >= 0:
                seq_len = seq_len.cpu()

        blstm_packed_in = nn.utils.rnn.pack_padded_sequence(
            input=x,
            lengths=seq_len,
            enforce_sorted=self.enforce_sorted,
            batch_first=True,
        )
        blstm_out, _ = self.blstm_stack(blstm_packed_in)
        blstm_out, _ = nn.utils.rnn.pad_packed_sequence(blstm_out, padding_value=0.0, batch_first=True)

        return blstm_out
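

# ONNX export sketch: a hypothetical helper, not part of i6_models, that illustrates the
# docstring's claim that the module is ONNX-exportable with enforce_sorted=True.
# The file name, input/output names, and dummy shapes are assumptions for illustration.
# Note that seq_len must already live on the CPU and be sorted in decreasing order,
# since the device move in forward() is skipped during tracing.
def export_encoder_to_onnx(encoder: BlstmEncoderV1, input_dim: int, onnx_path: str = "blstm_encoder.onnx"):
    encoder.eval()
    dummy_features = torch.randn(2, 20, input_dim)  # [B, T, F]
    dummy_lengths = torch.tensor([20, 15], dtype=torch.int64)  # sorted, on CPU
    torch.onnx.export(
        encoder,
        (dummy_features, dummy_lengths),
        onnx_path,
        input_names=["features", "features_len"],
        output_names=["encoder_output"],
        dynamic_axes={
            "features": {0: "batch", 1: "time"},
            "features_len": {0: "batch"},
            "encoder_output": {0: "batch", 1: "time"},
        },
    )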
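

# Illustrative usage sketch (not part of the original module): the hyperparameter values
# below are assumptions chosen only to demonstrate the expected shapes. With
# enforce_sorted=True the sequence lengths must already be sorted in decreasing order,
# as described in the config docstring.
if __name__ == "__main__":
    example_cfg = BlstmEncoderV1Config(
        num_layers=2,
        input_dim=80,
        hidden_dim=512,
        dropout=0.1,
        enforce_sorted=True,
    )
    encoder = BlstmEncoderV1(example_cfg)

    batch_size, max_time = 3, 100
    features = torch.randn(batch_size, max_time, example_cfg.input_dim)  # [B, T, F]
    feature_lengths = torch.tensor([100, 80, 50])  # already sorted in decreasing order

    output = encoder(features, feature_lengths)
    assert output.shape == (batch_size, max_time, 2 * example_cfg.hidden_dim)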