@@ -2,10 +2,10 @@

 __all__ = ["ConformerMHSAV1", "ConformerMHSAV1Config"]
 from dataclasses import dataclass
-from typing import Optional
 import torch

 from i6_models.config import ModelConfiguration
+from i6_models.util import compat


 @dataclass
@@ -43,17 +43,19 @@ def __init__(self, cfg: ConformerMHSAV1Config):
         )
         self.dropout = cfg.dropout

-    def forward(self, input_tensor: torch.Tensor, key_padding_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
+    def forward(self, input_tensor: torch.Tensor, sequence_mask: torch.Tensor) -> torch.Tensor:
         """
         Apply layer norm and multi-head self attention and dropout
-        :param Optional[torch.Tensor] key_padding_mask: could be a binary or float mask of shape (B, T)
+
+        :param input_tensor: Input to the self attention of shape (B, T, F)
+        :param sequence_mask: bool mask of shape (B, T), True signals within sequence, False outside, will be inverted to match the torch.nn.MultiheadAttention module
             which will be applied/added to dot product, used to mask padded key positions out
         """
-
+        inv_sequence_mask = compat.logical_not(sequence_mask)
         output_tensor = self.layernorm(input_tensor)  # [B,T,F]

         output_tensor, _ = self.mhsa(
-            output_tensor, output_tensor, output_tensor, key_padding_mask=key_padding_mask, need_weights=False
+            output_tensor, output_tensor, output_tensor, key_padding_mask=inv_sequence_mask, need_weights=False
         )  # [B,T,F]
         output_tensor = torch.nn.functional.dropout(output_tensor, p=self.dropout, training=self.training)  # [B,T,F]

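Note on the change: forward() now takes a sequence_mask where True marks frames inside the sequence, while torch.nn.MultiheadAttention expects a key_padding_mask where True marks key positions to ignore, so the diff inverts the mask via compat.logical_not before the attention call. Below is a minimal sketch of that convention using plain torch.logical_not (the compat helper is assumed to be a version-compatible equivalent); the names seq_lens and max_len and the small MultiheadAttention configuration are illustrative only, not taken from the repository.

    # Sketch only: shows the sequence_mask -> key_padding_mask inversion, not the module itself.
    import torch

    seq_lens = torch.tensor([4, 2, 3])   # valid length per sequence in the batch (illustrative)
    max_len = int(seq_lens.max())        # padded time dimension T

    # sequence_mask: True inside the sequence, False at padded positions, shape [B, T]
    sequence_mask = torch.arange(max_len)[None, :] < seq_lens[:, None]

    # torch.nn.MultiheadAttention uses the opposite convention for key_padding_mask:
    # True marks key positions to be ignored, hence the inversion in the new forward().
    inv_sequence_mask = torch.logical_not(sequence_mask)

    mhsa = torch.nn.MultiheadAttention(embed_dim=8, num_heads=2, batch_first=True)
    x = torch.randn(3, max_len, 8)       # [B, T, F]
    out, _ = mhsa(x, x, x, key_padding_mask=inv_sequence_mask, need_weights=False)
    print(out.shape)                     # torch.Size([3, 4, 8])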