
Commit b661cee

refactor + implement zoneout
1 parent 5770d0e commit b661cee

2 files changed: +54 -9 lines changed


i6_models/decoder/attention.py

Lines changed: 20 additions & 6 deletions
@@ -4,6 +4,8 @@
 import torch
 from torch import nn
 
+from .zoneout_lstm import ZoneoutLSTMCell
+
 
 @dataclass
 class AdditiveAttentionConfig:
@@ -46,7 +48,6 @@ def forward(
         :param enc_seq_len: encoder sequence lengths [B]
         :return: attention context [B,D_v], attention weights [B,T,1]
         """
-
         # all inputs are already projected
         energies = self.linear(nn.functional.tanh(key + query.unsqueeze(1) + weight_feedback))  # [B,T,1]
         time_arange = torch.arange(energies.size(1))  # [T]
@@ -60,14 +61,16 @@ def forward(
 
 
 @dataclass
-class AttentionLstmDecoderV1Config:
+class AttentionLSTMDecoderV1Config:
     """
     Attributes:
         encoder_dim: encoder dimension
         vocab_size: vocabulary size
         target_embed_dim: embedding dimension
         target_embed_dropout: embedding dropout
         lstm_hidden_size: LSTM hidden size
+        zoneout_drop_h: zoneout drop probability for hidden state
+        zoneout_drop_c: zoneout drop probability for cell state
         attention_cfg: attention config
         output_proj_dim: output projection dimension
         output_dropout: output dropout
@@ -78,26 +81,36 @@ class AttentionLstmDecoderV1Config:
     target_embed_dim: int
     target_embed_dropout: float
     lstm_hidden_size: int
+    zoneout_drop_h: float
+    zoneout_drop_c: float
     attention_cfg: AdditiveAttentionConfig
     output_proj_dim: int
     output_dropout: float
 
 
-class AttentionLstmDecoderV1(nn.Module):
+class AttentionLSTMDecoderV1(nn.Module):
     """
     Single-headed Attention decoder with additive attention mechanism.
     """
 
-    def __init__(self, cfg: AttentionLstmDecoderV1Config):
+    def __init__(self, cfg: AttentionLSTMDecoderV1Config):
         super().__init__()
 
         self.target_embed = nn.Embedding(num_embeddings=cfg.vocab_size, embedding_dim=cfg.target_embed_dim)
         self.target_embed_dropout = nn.Dropout(cfg.target_embed_dropout)
 
-        self.s = nn.LSTMCell(
+        lstm_cell = nn.LSTMCell(
             input_size=cfg.target_embed_dim + cfg.encoder_dim,
             hidden_size=cfg.lstm_hidden_size,
         )
+        self.lstm_hidden_size = cfg.lstm_hidden_size
+        # if zoneout drop probs are 0, then it is equivalent to normal LSTMCell
+        self.s = ZoneoutLSTMCell(
+            cell=lstm_cell,
+            zoneout_h=cfg.zoneout_drop_h,
+            zoneout_c=cfg.zoneout_drop_c,
+        )
+
         self.s_transformed = nn.Linear(cfg.lstm_hidden_size, cfg.attention_cfg.attention_dim, bias=False)  # query
 
         # for attention
@@ -127,7 +140,8 @@ def forward(
         :param state: decoder state
         """
         if state is None:
-            lstm_state = None
+            zeros = torch.zeros((encoder_outputs.size(0), self.lstm_hidden_size))
+            lstm_state = (zeros, zeros)
             att_context = torch.zeros((encoder_outputs.size(0), encoder_outputs.size(2)))
             accum_att_weights = encoder_outputs.new_zeros((encoder_outputs.size(0), encoder_outputs.size(1), 1))
         else:
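
Note: the ZoneoutLSTMCell imported above lives in i6_models/decoder/zoneout_lstm.py, which is not part of this diff. For context, here is a minimal sketch of how such a wrapper can implement zoneout (Krueger et al., 2017): during training, each hidden and cell unit keeps its previous value with the configured probability; at inference, the previous and new states are interpolated deterministically. Only the constructor keywords (cell, zoneout_h, zoneout_c) are taken from the call site above; the body is an assumption, not the repository's actual implementation.

import torch
from torch import nn


class ZoneoutLSTMCell(nn.Module):
    """Sketch of an LSTMCell wrapper with zoneout; the real i6_models code may differ."""

    def __init__(self, cell: nn.LSTMCell, zoneout_h: float, zoneout_c: float):
        super().__init__()
        self.cell = cell
        self.zoneout_h = zoneout_h
        self.zoneout_c = zoneout_c

    def forward(self, inputs: torch.Tensor, state):
        prev_h, prev_c = state
        h, c = self.cell(inputs, state)
        return self._zoneout(prev_h, h, self.zoneout_h), self._zoneout(prev_c, c, self.zoneout_c)

    def _zoneout(self, prev: torch.Tensor, curr: torch.Tensor, p: float):
        if p == 0.0:
            return curr  # zero drop prob: behaves exactly like the wrapped LSTMCell
        if self.training:
            # per-unit Bernoulli mask: keep the previous state with probability p
            mask = torch.bernoulli(torch.full_like(prev, p))
            return mask * prev + (1.0 - mask) * curr
        # inference: use the expected value of the stochastic update
        return p * prev + (1.0 - p) * curr

This sketch also motivates the change to the decoder's initial state above: a wrapper that interpolates with the previous state needs an explicit (h, c) pair, which is presumably why the diff replaces lstm_state = None with a tuple of zero tensors.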

tests/test_enc_dec_att.py

Lines changed: 34 additions & 3 deletions
@@ -2,7 +2,7 @@
 from torch import nn
 
 from i6_models.decoder.attention import AdditiveAttention, AdditiveAttentionConfig
-from i6_models.decoder.attention import AttentionLstmDecoderV1, AttentionLstmDecoderV1Config
+from i6_models.decoder.attention import AttentionLSTMDecoderV1, AttentionLSTMDecoderV1Config
 
 
 def test_additive_attention():
@@ -29,7 +29,7 @@ def test_additive_attention():
 def test_encoder_decoder_attention_model():
     encoder = torch.rand((10, 20, 5))
     encoder_seq_len = torch.arange(start=10, end=20)  # [10, ..., 19]
-    decoder_cfg = AttentionLstmDecoderV1Config(
+    decoder_cfg = AttentionLSTMDecoderV1Config(
         encoder_dim=5,
         vocab_size=15,
         target_embed_dim=3,
@@ -38,10 +38,41 @@ def test_encoder_decoder_attention_model():
         attention_cfg=AdditiveAttentionConfig(attention_dim=10, att_weights_dropout=0.1),
         output_proj_dim=12,
         output_dropout=0.1,
+        zoneout_drop_c=0.0,
+        zoneout_drop_h=0.0,
     )
-    decoder = AttentionLstmDecoderV1(decoder_cfg)
+    decoder = AttentionLSTMDecoderV1(decoder_cfg)
     target_labels = torch.randint(low=0, high=15, size=(10, 7))  # [B,N]
 
     decoder_logits, _ = decoder(encoder_outputs=encoder, labels=target_labels, enc_seq_len=encoder_seq_len)
 
     assert decoder_logits.shape == (10, 7, 15)
+
+
+def test_zoneout_lstm_cell():
+    encoder = torch.rand((10, 20, 5))
+    encoder_seq_len = torch.arange(start=10, end=20)  # [10, ..., 19]
+    target_labels = torch.randint(low=0, high=15, size=(10, 7))  # [B,N]
+
+    def forward_decoder(zoneout_drop_c: float, zoneout_drop_h: float):
+        decoder_cfg = AttentionLSTMDecoderV1Config(
+            encoder_dim=5,
+            vocab_size=15,
+            target_embed_dim=3,
+            target_embed_dropout=0.1,
+            lstm_hidden_size=12,
+            attention_cfg=AdditiveAttentionConfig(attention_dim=10, att_weights_dropout=0.1),
+            output_proj_dim=12,
+            output_dropout=0.1,
+            zoneout_drop_c=zoneout_drop_c,
+            zoneout_drop_h=zoneout_drop_h,
+        )
+        decoder = AttentionLSTMDecoderV1(decoder_cfg)
+        decoder_logits, _ = decoder(encoder_outputs=encoder, labels=target_labels, enc_seq_len=encoder_seq_len)
+        return decoder_logits
+
+    decoder_logits = forward_decoder(zoneout_drop_c=0.15, zoneout_drop_h=0.05)
+    assert decoder_logits.shape == (10, 7, 15)
+
+    decoder_logits = forward_decoder(zoneout_drop_c=0.0, zoneout_drop_h=0.0)
+    assert decoder_logits.shape == (10, 7, 15)
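
The new test runs the decoder with stochastic (zoneout_drop_c=0.15, zoneout_drop_h=0.05) and disabled (0.0, 0.0) zoneout, but only asserts output shapes. The comment in attention.py claims that zero drop probabilities make the wrapper equivalent to a plain LSTMCell; a hypothetical unit test along the following lines would check that claim directly. It assumes the wrapper's forward mirrors nn.LSTMCell, taking (input, (h, c)) and returning (h, c), which this diff does not confirm.

import torch
from torch import nn

from i6_models.decoder.zoneout_lstm import ZoneoutLSTMCell


def test_zoneout_zero_probs_match_plain_cell():
    torch.manual_seed(0)
    cell = nn.LSTMCell(input_size=8, hidden_size=12)
    zoneout_cell = ZoneoutLSTMCell(cell=cell, zoneout_h=0.0, zoneout_c=0.0)

    inputs = torch.rand((10, 8))
    state = (torch.zeros((10, 12)), torch.zeros((10, 12)))

    # both cells share the same weights, so with zoneout disabled
    # their outputs must match exactly
    h_ref, c_ref = cell(inputs, state)
    h_zo, c_zo = zoneout_cell(inputs, state)

    assert torch.allclose(h_ref, h_zo)
    assert torch.allclose(c_ref, c_zo)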
