Commit fe29a23

Update Naming
1 parent fd063b4 commit fe29a23

File tree

15 files changed (+86 −81 lines)


bin/inference.py

Lines changed: 3 additions & 1 deletion
@@ -18,13 +18,15 @@
 import numpy as np
 import torchaudio
 from torch import Tensor
+
 from kospeech.vocabs.ksponspeech import KsponSpeechVocabulary
 from kospeech.data.audio.core import load_audio
 from kospeech.models import (
     SpeechTransformer,
     Jasper,
     DeepSpeech2,
     ListenAttendSpell,
+    Conformer,
 )

@@ -67,7 +69,7 @@ def parse_audio(audio_path: str, del_silence: bool = False, audio_extension: str
     elif isinstance(model, DeepSpeech2):
         model.device = opt.device
         y_hats = model.greedy_search(feature.unsqueeze(0), input_length, opt.device)
-    elif isinstance(model, SpeechTransformer) or isinstance(model, Jasper):
+    elif isinstance(model, SpeechTransformer) or isinstance(model, Jasper) or isinstance(model, Conformer):
         y_hats = model.greedy_search(feature.unsqueeze(0), input_length, opt.device)

     sentence = vocab.label_to_string(y_hats.cpu().detach().numpy())
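
Side note on the new dispatch line: all three model classes expose the same greedy_search signature, and Python's isinstance accepts a tuple of types, so the growing or-chain could be collapsed. A minimal sketch, not part of the commit:

    # isinstance() accepts a tuple of types, so the three checks fold into one.
    if isinstance(model, (SpeechTransformer, Jasper, Conformer)):
        y_hats = model.greedy_search(feature.unsqueeze(0), input_length, opt.device)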

kospeech/checkpoint/checkpoint.py

Lines changed: 2 additions & 1 deletion
@@ -16,6 +16,7 @@
 import time
 import torch
 import torch.nn as nn
+
 from kospeech.utils import logger
 from kospeech.data import SpectrogramDataset
 from kospeech.models import ListenAttendSpell

@@ -54,7 +55,7 @@ def __init__(
         optimizer: Optimizer = None,             # stores the state of the optimizer
         trainset_list: list = None,              # list of trainset
         validset: SpectrogramDataset = None,     # validation dataset
-        epoch: int = None                        # current epoch is a loop through the full training data
+        epoch: int = None,                       # current epoch is a loop through the full training data
 ) -> None:
     self.model = model
     self.optimizer = optimizer

kospeech/criterion/label_smoothed_cross_entropy.py

Lines changed: 10 additions & 10 deletions
@@ -30,8 +30,8 @@ class LabelSmoothedCrossEntropyLoss(nn.Module):
         reduction (str): reduction method [sum, mean] (default: sum)
         architecture (str): speech model`s model [las, transformer] (default: las)

-    Inputs: logit, target
-        logit (torch.Tensor): probability distribution value from model and it has a logarithm shape
+    Inputs: logits, target
+        logits (torch.Tensor): probability distribution value from model and it has a logarithm shape
         target (torch.Tensor): ground-thruth encoded to integers which directly point a word in label

     Returns: label_smoothed

@@ -44,7 +44,7 @@ def __init__(
             smoothing: float = 0.1,     # ratio of smoothing (confidence = 1.0 - smoothing)
             dim: int = -1,              # dimension of caculation loss
             reduction='sum',            # reduction method [sum, mean]
-            architecture='las'          # speech model`s model [las, transformer]
+            architecture='las',         # speech model`s model [las, transformer]
     ) -> None:
         super(LabelSmoothedCrossEntropyLoss, self).__init__()
         self.confidence = 1.0 - smoothing

@@ -62,16 +62,16 @@ def __init__(
         else:
             raise ValueError("Unsupported reduction method {0}".format(reduction))

-    def forward(self, logit: Tensor, target: Tensor):
+    def forward(self, logits: Tensor, targets: Tensor):
         if self.architecture == 'transformer':
-            logit = F.log_softmax(logit, dim=-1)
+            logits = F.log_softmax(logits, dim=-1)

         if self.smoothing > 0.0:
             with torch.no_grad():
-                label_smoothed = torch.zeros_like(logit)
+                label_smoothed = torch.zeros_like(logits)
                 label_smoothed.fill_(self.smoothing / (self.num_classes - 1))
-                label_smoothed.scatter_(1, target.data.unsqueeze(1), self.confidence)
-                label_smoothed[target == self.ignore_index, :] = 0
-                return self.reduction_method(-label_smoothed * logit)
+                label_smoothed.scatter_(1, targets.data.unsqueeze(1), self.confidence)
+                label_smoothed[targets == self.ignore_index, :] = 0
+                return self.reduction_method(-label_smoothed * logits)

-        return F.cross_entropy(logit, target, ignore_index=self.ignore_index, reduction=self.reduction)
+        return F.cross_entropy(logits, targets, ignore_index=self.ignore_index, reduction=self.reduction)
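
To make the smoothing arithmetic concrete, here is a self-contained sketch of the target distribution that forward() builds; the values of num_classes, smoothing, and pad_id are illustrative, not taken from the commit:

    import torch

    num_classes, smoothing, pad_id = 5, 0.1, 0          # illustrative numbers
    confidence = 1.0 - smoothing                        # 0.9 kept on the true class
    targets = torch.tensor([2, 4, pad_id])              # last row is padding

    label_smoothed = torch.zeros(len(targets), num_classes)
    label_smoothed.fill_(smoothing / (num_classes - 1))           # 0.025 on wrong classes
    label_smoothed.scatter_(1, targets.unsqueeze(1), confidence)  # 0.9 on each target index
    label_smoothed[targets == pad_id, :] = 0                      # padding rows contribute no loss
    # Every non-padding row sums to 1.0: 0.9 + 4 * 0.025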

kospeech/models/conformer/model.py

Lines changed: 3 additions & 1 deletion
@@ -14,6 +14,7 @@

 import torch
 import torch.nn as nn
+import torch.nn.functional as F
 from torch import Tensor
 from typing import Tuple

@@ -87,7 +88,8 @@ def __init__(
     def forward(self, inputs: Tensor, input_lengths: Tensor) -> Tuple[Tensor, Tensor]:
         outputs, output_lengths = self.encoder(inputs, input_lengths)
-        outputs = self.fc(outputs).log_softmax(dim=-1)
+        outputs = self.fc(outputs)
+        outputs = F.log_softmax(outputs, dim=-1)
         return outputs, output_lengths

     def greedy_search(self, inputs: Tensor, input_lengths: Tensor, device: str):
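
The forward() change is purely stylistic: the Tensor method and the functional form of log-softmax are the same operation; the commit just splits the fused line and routes it through torch.nn.functional for consistency with the other models. A quick illustrative check:

    import torch
    import torch.nn.functional as F

    x = torch.randn(2, 10, 5)  # (batch, time, classes), arbitrary sizes
    # Tensor method and functional form produce identical results.
    assert torch.allclose(x.log_softmax(dim=-1), F.log_softmax(x, dim=-1))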

kospeech/models/conv.py

Lines changed: 1 addition & 1 deletion
@@ -141,7 +141,7 @@ def forward(self, inputs: Tensor, input_lengths: Tensor) -> Tuple[Tensor, Tensor


 class MaskConv1d(nn.Conv1d):
-    """1D convolution with sequence masking """
+    """ 1D convolution with sequence masking """
     def __init__(
             self,
             in_channels: int,

kospeech/models/deepspeech2/model.py

Lines changed: 10 additions & 10 deletions
@@ -90,22 +90,22 @@ def forward(self, inputs: Tensor, input_lengths: Tensor) -> Tuple[Tensor, Tensor
             input_lengths (torch.LongTensor): (batch_size)
         """
         inputs = inputs.unsqueeze(1).permute(0, 1, 3, 2)
-        output, output_lengths = self.conv(inputs, input_lengths)
+        outputs, output_lengths = self.conv(inputs, input_lengths)

-        batch_size, num_channels, hidden_dim, seq_length = output.size()
-        output = output.view(batch_size, num_channels * hidden_dim, seq_length).permute(2, 0, 1).contiguous()
+        batch_size, num_channels, hidden_dim, seq_length = outputs.size()
+        outputs = outputs.view(batch_size, num_channels * hidden_dim, seq_length).permute(2, 0, 1).contiguous()

         for rnn_layer in self.rnn_layers:
             rnn_layer.to(self.device)
-            output = rnn_layer(output, output_lengths)
+            outputs = rnn_layer(outputs, output_lengths)

-        output = output.transpose(0, 1)
-        output = self.fc(output)
-        output = F.log_softmax(output, dim=-1)
+        outputs = outputs.transpose(0, 1)
+        outputs = self.fc(outputs)
+        outputs = F.log_softmax(outputs, dim=-1)

-        return output, output_lengths
+        return outputs, output_lengths

     def greedy_search(self, inputs: Tensor, input_lengths: Tensor, device: str):
         with torch.no_grad():
-            output, output_lengths = self.forward(inputs, input_lengths)
-            return output.max(-1)[1]
+            outputs, output_lengths = self.forward(inputs, input_lengths)
+            return outputs.max(-1)[1]
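
greedy_search simply picks the most likely class at every timestep: outputs.max(-1)[1] is the indices half of torch.max, i.e. an argmax over the class dimension. A sketch of what happens downstream (the dummy tensor is a stand-in for forward(); the label_to_string call mirrors bin/inference.py above):

    import torch

    outputs = torch.randn(1, 7, 10).log_softmax(dim=-1)  # (batch, time, classes) stand-in
    y_hats = outputs.max(-1)[1]                          # equivalent to outputs.argmax(dim=-1)
    # bin/inference.py then converts the index sequence back to text:
    # sentence = vocab.label_to_string(y_hats.cpu().detach().numpy())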

kospeech/models/jasper/decoder.py

Lines changed: 4 additions & 4 deletions
@@ -60,12 +60,12 @@ def forward(self, encoder_outputs: Tensor, encoder_output_lengths: Tensor) -> Tu
             encoder_outputs (torch.FloatTensor): (batch_size, dimension, sequence_length)
             encoder_output_lengths (torch.LongTensor): (batch_size)
         """
-        output, output_lengths = encoder_outputs, encoder_output_lengths
+        outputs, output_lengths = encoder_outputs, encoder_output_lengths

         for i, layer in enumerate(self.layers):
-            output, output_lengths = layer(output, output_lengths)
+            outputs, output_lengths = layer(outputs, output_lengths)

-        output = F.log_softmax(output.transpose(1, 2), dim=-1)
+        outputs = F.log_softmax(outputs.transpose(1, 2), dim=-1)
         del encoder_outputs, encoder_output_lengths

-        return output, output_lengths
+        return outputs, output_lengths

kospeech/models/jasper/encoder.py

Lines changed: 2 additions & 2 deletions
@@ -84,10 +84,10 @@ def forward(self, inputs: Tensor, input_lengths: Tensor) -> Tuple[Tensor, Tensor
             prev_output_lengths.append(input_lengths)
             residual = self._get_jasper_dencse_residual(prev_outputs, prev_output_lengths, i)

-        output, output_lengths = self.layers[-1](inputs, input_lengths, residual)
+        outputs, output_lengths = self.layers[-1](inputs, input_lengths, residual)
         del prev_outputs, prev_output_lengths, residual, inputs, input_lengths

-        return output, output_lengths
+        return outputs, output_lengths

     def _get_jasper_dencse_residual(self, prev_outputs: list, prev_output_lengths: list, index: int):
         residual = None

kospeech/models/jasper/model.py

Lines changed: 4 additions & 4 deletions
@@ -72,10 +72,10 @@ def forward(self, inputs: Tensor, input_lengths: Tensor) -> Tuple[Tensor, Tensor
             input_lengths (torch.LongTensor): (batch_size)
         """
         encoder_outputs, output_lengths = self.encoder(inputs.transpose(1, 2), input_lengths)
-        output, output_lengths = self.decoder(encoder_outputs, output_lengths)
-        return output, output_lengths
+        outputs, output_lengths = self.decoder(encoder_outputs, output_lengths)
+        return outputs, output_lengths

     def greedy_search(self, inputs: Tensor, input_lengths: Tensor, device: str):
         with torch.no_grad():
-            output, output_lengths = self.forward(inputs, input_lengths)
-            return output.max(-1)[1]
+            outputs, output_lengths = self.forward(inputs, input_lengths)
+            return outputs.max(-1)[1]

kospeech/models/jasper/sublayers.py

Lines changed: 7 additions & 7 deletions
@@ -80,9 +80,9 @@ def forward(self, inputs: Tensor, input_lengths: Tensor, residual: Tensor) -> Tu
         for layer in self.layers[:-1]:
             inputs, input_lengths = layer(inputs, input_lengths)

-        output, output_lengths = self.layers[-1](inputs, input_lengths, residual)
+        outputs, output_lengths = self.layers[-1](inputs, input_lengths, residual)

-        return output, output_lengths
+        return outputs, output_lengths


 class JasperSubBlock(nn.Module):

@@ -145,13 +145,13 @@ def __init__(
         self.dropout = nn.Dropout(p=dropout_p)

     def forward(self, inputs: Tensor, input_lengths: Tensor, residual: Optional[Tensor] = None) -> Tuple[Tensor, Tensor]:
-        output, output_lengths = self.conv(inputs, input_lengths)
-        output = self.batch_norm(output)
+        outputs, output_lengths = self.conv(inputs, input_lengths)
+        outputs = self.batch_norm(outputs)

         if residual is not None:
-            output += residual
+            outputs += residual

-        output = self.dropout(self.activation(output))
+        outputs = self.dropout(self.activation(outputs))
         del inputs, input_lengths, residual

-        return output, output_lengths
+        return outputs, output_lengths
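
For context, JasperSubBlock.forward follows the usual conv → batch-norm → optional residual add → activation → dropout ordering. A minimal self-contained sketch of that pattern, using a plain nn.Conv1d in place of the repository's masked convolution and illustrative sizes:

    import torch
    import torch.nn as nn

    conv = nn.Conv1d(64, 64, kernel_size=11, padding=5)  # length-preserving conv
    batch_norm = nn.BatchNorm1d(64)
    activation = nn.ReLU()
    dropout = nn.Dropout(p=0.2)

    inputs = torch.randn(8, 64, 100)          # (batch, channels, time)
    residual = inputs                         # e.g. the block's own input
    outputs = batch_norm(conv(inputs))
    outputs = outputs + residual              # residual add before the nonlinearity
    outputs = dropout(activation(outputs))    # shape stays (8, 64, 100)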
