from dataclasses import dataclass
import torch
from torch import nn

from i6_models.config import ModelConfiguration


@dataclass
class BlstmEncoderV1Config(ModelConfiguration):
    """
    Attributes:
        num_layers: number of bi-directional LSTM layers, minimum 2
        input_dim: input dimension size
        hidden_dim: hidden dimension of one direction of the LSTM, the total output size is twice this value
        dropout: nn.LSTM applies this dropout internally between the BLSTM layers (but not on the input/output)
        enforce_sorted:
            True: expects that sequences are sorted by sequence length in decreasing order.
                Will not do any sorting.
                This is required for ONNX export, and thus the recommended setting.
            False: no expectation.
                It will internally enforce that they are sorted
                and undo the reordering at the output.

    Sorting can, for example, be performed independently of the ONNX export, e.g. in train_step:

        audio_features_len, indices = torch.sort(audio_features_len, descending=True)
        audio_features = audio_features[indices, :, :]
        labels = labels[indices, :]
        labels_len = labels_len[indices]
    """

    num_layers: int
    input_dim: int
    hidden_dim: int
    dropout: float
    enforce_sorted: bool

class BlstmEncoderV1(torch.nn.Module):
    """
    Simple multi-layer BLSTM model including dropout, batch-first variant,
    hardcoded to use B,T,F input

    supports: TorchScript, ONNX export
    """

    def __init__(self, config: BlstmEncoderV1Config):
        """
        :param config: configuration object
        """
        super().__init__()
        self.dropout = config.dropout
        self.enforce_sorted = config.enforce_sorted
        self.blstm_stack = nn.LSTM(
            input_size=config.input_dim,
            hidden_size=config.hidden_dim,
            bidirectional=True,
            num_layers=config.num_layers,
            batch_first=True,
            dropout=self.dropout,
        )

    def forward(self, x: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor:
        """
        :param x: [B, T, input_dim]
        :param seq_len: [B], should be on CPU for Script/Trace mode
        :return: [B, T, 2 * hidden_dim]
        """
        if not torch.jit.is_scripting() and not torch.jit.is_tracing():
            # in graph mode (scripting/tracing) we have to assume all tensors are on the correct device,
            # otherwise move the lengths to the CPU if they are on GPU
            if seq_len.get_device() >= 0:
                seq_len = seq_len.cpu()

        blstm_packed_in = nn.utils.rnn.pack_padded_sequence(
            input=x,
            lengths=seq_len,
            enforce_sorted=self.enforce_sorted,
            batch_first=True,
        )
        blstm_out, _ = self.blstm_stack(blstm_packed_in)
        blstm_out, _ = nn.utils.rnn.pad_packed_sequence(blstm_out, padding_value=0.0, batch_first=True)

        return blstm_out
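

# ONNX export sketch: a hypothetical helper, not part of i6_models, that illustrates the
# docstring's claim that the module is ONNX-exportable with enforce_sorted=True.
# The file name, input/output names, and dummy shapes are assumptions for illustration.
# Note that seq_len must already live on the CPU and be sorted in decreasing order,
# since the device move in forward() is skipped during tracing.
def export_encoder_to_onnx(encoder: BlstmEncoderV1, input_dim: int, onnx_path: str = "blstm_encoder.onnx"):
    encoder.eval()
    dummy_features = torch.randn(2, 20, input_dim)  # [B, T, F]
    dummy_lengths = torch.tensor([20, 15], dtype=torch.int64)  # sorted, on CPU
    torch.onnx.export(
        encoder,
        (dummy_features, dummy_lengths),
        onnx_path,
        input_names=["features", "features_len"],
        output_names=["encoder_output"],
        dynamic_axes={
            "features": {0: "batch", 1: "time"},
            "features_len": {0: "batch"},
            "encoder_output": {0: "batch", 1: "time"},
        },
    )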
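

# Illustrative usage sketch (not part of the original module): the hyperparameter values
# below are assumptions chosen only to demonstrate the expected shapes. With
# enforce_sorted=True the sequence lengths must already be sorted in decreasing order,
# as described in the config docstring.
if __name__ == "__main__":
    example_cfg = BlstmEncoderV1Config(
        num_layers=2,
        input_dim=80,
        hidden_dim=512,
        dropout=0.1,
        enforce_sorted=True,
    )
    encoder = BlstmEncoderV1(example_cfg)

    batch_size, max_time = 3, 100
    features = torch.randn(batch_size, max_time, example_cfg.input_dim)  # [B, T, F]
    feature_lengths = torch.tensor([100, 80, 50])  # already sorted in decreasing order

    output = encoder(features, feature_lengths)
    assert output.shape == (batch_size, max_time, 2 * example_cfg.hidden_dim)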