Commit af48c96

add topology and document alignment and loss (#62)
1 parent: 9c057aa

3 files changed: +153 −59
common/models/transducer/alignment.py

Lines changed: 60 additions & 23 deletions
@@ -1,16 +1,22 @@
 from returnn.tf.util.basic import get_shape_dim, check_input_dim
+from returnn.tf.util.data import Data
+from returnn.tf.layers.basic import LayerBase
+import tensorflow as tf
+from typing import List
 
 
-def rna_alignment(source, **kwargs):
+def rna_alignment(source, **kwargs) -> tf.Tensor:
+  """ Used only to create alignments according to the RNA loss function.
+  B: batch, T: time, U: target/labels, V: vocabulary
+  Args:
+    source: function (i: int, as_data: bool = False, ...) -> tf.Tensor|Data
+      which returns one of:
+      output_log_prob: [B, T, U+1, V] log-probabilities
+      real_target: [B, U] -> [V] target labels
+      base:encoder: [B, T, Feat] -> [V] encoder output
+  Returns:
+    alignment: [B, T], holding a value in [0:blank_ix] for each alignment frame
   """
-  Used only to create alignments according to RNA loss function.
-  :sources: [output_log_prob, real_target, "base:encoder"]
-  :return: alignments: [B, T] for each frame a value in [0:blank_ix]
-  """
-  # acts: (B, T, U, V)
-  # targets: (B, U-1)
-  # input_lengths (B,)
-  # label_lengths (B,)
   from .rna_align_sum_max_pure_tf import tf_forward_shifted_rna
 
   log_probs = source(0, as_data=True, auto_convert=False).get_placeholder_as_batch_major()
@@ -28,22 +34,23 @@ def rna_alignment(source, **kwargs):
   #          "log-probs:", tf.shape(log_probs.get_placeholder_as_batch_major())], summarize=-1)
 
   blank_idx = targets.dim  # targets is without blank
-  costs, alignment = tf_forward_shifted_rna(log_probs, targets.get_placeholder_as_batch_major(), enc_lens, dec_lens,
-                                            blank_index=blank_idx, debug=False, with_alignment=True)
-  return alignment  # (B, T)
+  _, alignment = tf_forward_shifted_rna(log_probs, targets.get_placeholder_as_batch_major(), enc_lens, dec_lens,
+                                        blank_index=blank_idx, debug=False, with_alignment=True)
+  return alignment  # [B, T]
 
 
-def rnnt_alignment(source, **kwargs):
-  """
-  Used only to create alignments according to RNNT loss function.
-  :sources: [output_log_prob, real_target, "base:encoder"]
-  :return: alignments: [B, T] for each frame a value in [0:blank_ix]
+def rnnt_alignment(source, **kwargs) -> tf.Tensor:
+  """ Used only to create alignments according to the RNNT loss function.
+  B: batch, T: time, U: target/labels, V: vocabulary
+  Args:
+    source: function (i: int, as_data: bool = False, ...) -> tf.Tensor|Data
+      which returns one of:
+      output_log_prob: [B, T, U+1, V] log-probabilities
+      real_target: [B, U] -> [V] target labels
+      base:encoder: [B, T, Feat] -> [V] encoder output
+  Returns:
+    alignment: [B, T+U], holding a value in [0:blank_ix] for each alignment frame
   """
-  # alignment-length (B, T+U+1)
-  # acts: (B, T, U+1, V)
-  # targets: (B, U)
-  # input_lengths (B,)
-  # label_lengths (B,)
   from .rnnt_align_sum_max_pure_tf import tf_forward_shifted_rnnt
 
   log_probs = source(0, as_data=True, auto_convert=False).get_placeholder_as_batch_major()
@@ -63,4 +70,34 @@ def rnnt_alignment(source, **kwargs):
   blank_idx = targets.dim
   _, alignment = tf_forward_shifted_rnnt(log_probs, targets.get_placeholder_as_batch_major(), enc_lens, dec_lens,
                                          blank_index=blank_idx, debug=False, with_alignment=True)
-  return alignment  # (B, T)
+  return alignment  # [B, T+U]
+
+
+def rna_alignment_out_type(sources: List[LayerBase], **_kwargs) -> Data:
+  """ Computes the out_type Data for the RNA alignment.
+  B: batch, T: time, U: target/labels, V: vocabulary
+  Args:
+    sources:
+      output_log_prob: [B, T, U+1, V] log-probabilities
+      real_target: [B, U] -> [V] target labels
+      base:encoder: [B, T, Feat] -> [V] encoder output
+  Returns:
+    alignment [B, T]
+  """
+  return Data(name="rna_alignment_output", sparse=True, dim=sources[0].output.dim,
+              size_placeholder={0: sources[2].output.size_placeholder[0]})
+
+
+def rnnt_alignment_out_type(sources: List[LayerBase], **_kwargs) -> Data:
+  """ Computes the out_type Data for the RNNT alignment.
+  B: batch, T: time, U: target/labels, V: vocabulary
+  Args:
+    sources:
+      output_log_prob: [B, T, U+1, V] log-probabilities
+      real_target: [B, U] -> [V] target labels
+      base:encoder: [B, T, Feat] -> [V] encoder output
+  Returns:
+    alignment [B, T+U]
+  """
+  return Data(name="rnnt_alignment_output", sparse=True, dim=sources[0].output.dim,
+              size_placeholder={0: sources[1].output.size_placeholder[0] + sources[2].output.size_placeholder[0]})
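
For orientation, here is a minimal sketch of how these alignment helpers are meant to be hooked into a RETURNN network dict via an eval layer, as the docstrings describe; the layer name and the "data:classes" source are illustrative assumptions, not part of this commit:

# Hypothetical wiring sketch (names are assumptions). The source order must
# match the docstrings: 0: output_log_prob, 1: targets, 2: base:encoder.
network = {
  "rna_alignment": {
    "class": "eval",
    "from": ["output_log_prob", "data:classes", "base:encoder"],
    "eval": rna_alignment,               # receives the source(i, as_data=...) callback
    "out_type": rna_alignment_out_type,  # sparse [B, T], time axis taken from the encoder
    "is_output_layer": True,             # so the alignment can be dumped/exported
  },
}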

common/models/transducer/loss.py

Lines changed: 40 additions & 36 deletions
@@ -1,18 +1,19 @@
-
 from returnn.tf.util.data import Data
-
-
-def rnnt_loss(source, **_kwargs):
-  """
-  Computes the RNN-T loss function. Native TF kernel implementation.
-
-  :param log_prob:
-  :return:
+import tensorflow as tf
+
+
+def rnnt_loss(source, **_kwargs) -> tf.Tensor:
+  """ Computes the RNN-T loss function. Native TF kernel implementation.
+  B: batch, T: time, U: target/labels, V: vocabulary
+  Args:
+    source: function (i: int, as_data: bool = False, ...) -> tf.Tensor|Data
+      which returns one of:
+      output_log_prob: [B, T, U+1, V] log-probabilities
+      target: [B, U] -> [V] target labels
+      base:encoder: [B, T, Feat] -> [V] encoder output
+  Returns:
+    costs: [B]
   """
-  # acts: (B, T, U + 1, V)
-  # targets: (B, T)
-  # input_lengths (B,)
-  # label_lengths (B,)
   from returnn.extern.HawkAaronWarpTransducer import rnnt_loss
 
   log_probs = source(0, as_data=True, auto_convert=False)
@@ -31,17 +32,18 @@ def rnnt_loss(source, **_kwargs):
   return costs
 
 
-def rnnt_tf_loss(source, **kwargs):
-  """
-  Computes the RNN-T loss function. Pure TF.
-
-  :param log_prob:
-  :return:
+def rnnt_tf_loss(source, **kwargs) -> tf.Tensor:
+  """ Computes the RNN-T loss function. Pure TF.
+  B: batch, T: time, U: target/labels, V: vocabulary
+  Args:
+    source: function (i: int, as_data: bool = False, ...) -> tf.Tensor|Data
+      which returns one of:
+      output_log_prob: [B, T, U+1, V] log-probabilities
+      target: [B, U] -> [V] target labels
+      base:encoder: [B, T, Feat] -> [V] encoder output
+  Returns:
+    costs: [B]
   """
-  # acts: (B, T, U + 1, V)
-  # targets: (B, T)
-  # input_lengths (B,)
-  # label_lengths (B,)
   from .rnnt_align_sum_max_pure_tf import tf_forward_shifted_rnnt
 
   log_probs = source(0, as_data=True, auto_convert=False)
@@ -60,20 +62,18 @@ def rnnt_tf_loss(source, **kwargs):
   return costs
 
 
-def rnnt_loss_out_type(**_kwargs) -> Data:
-  return Data(name="rnnt_loss", shape=())
-
-
-def rna_tf_loss(source, **kwargs):
-  """
-  Computes the RNA loss. Pure TF.
-  :param log_prob:
-  :return:
+def rna_tf_loss(source, **kwargs) -> tf.Tensor:
+  """ Computes the RNA loss. Pure TF.
+  B: batch, T: time, U: target/labels, V: vocabulary
+  Args:
+    source: function (i: int, as_data: bool = False, ...) -> tf.Tensor|Data
+      which returns one of:
+      output_log_prob: [B, T, U+1, V] log-probabilities
+      target: [B, U] -> [V] target labels
+      base:encoder: [B, T, Feat] -> [V] encoder output
+  Returns:
+    costs: [B]
   """
-  # acts: (B, T, U, V)
-  # targets: (B, U-1)
-  # input_lengths (B,)
-  # label_lengths (B,)
   from .rna_align_sum_max_pure_tf import tf_forward_shifted_rna
   from returnn.tf.compat import v1 as tf
 
@@ -92,5 +92,9 @@ def rna_tf_loss(source, **kwargs):
   return costs
 
 
+def rnnt_loss_out_type(**_kwargs) -> Data:
+  return Data(name="rnnt_loss", shape=())
+
+
 def rna_loss_out_type(**_kwargs) -> Data:
   return Data(name="rnna_loss", shape=())
Lines changed: 53 additions & 0 deletions
@@ -0,0 +1,53 @@
+from .loss import (rnnt_loss, rnnt_loss_out_type,
+                   rnnt_tf_loss, rna_tf_loss, rna_loss_out_type)
+from .alignment import (rnnt_alignment, rnnt_alignment_out_type,
+                        rna_alignment, rna_alignment_out_type)
+
+
+class Topology:
+  """
+  Holds information about a label topology such as RNA or RNN-T: its loss and
+  alignment functions and their out_types.
+
+  The loss, alignment and alignment_out_type functions are all meant to be used
+  in EvalLayers, taking input from layers that output the following:
+    output_log_prob: [B, T, U+1, V] log-probabilities
+    target: [B, U] -> [V] target labels
+    base:encoder: [B, T, Feat] -> [V] encoder output
+  where
+    B: batch, T: time, U: target/labels, V: vocabulary, U': seq_len of alignment
+  The EvalLayer offers a source() callback, which has to be used to get the mentioned data.
+  """
+  def __init__(self,
+               name: str,
+               loss,
+               loss_out_type,
+               alignment,
+               alignment_out_type):
+    """ Label topology such as rnnt, rna, ctc.
+    Args:
+      loss: function (source: (i: int, as_data: bool = False, ...) -> tf.Tensor|Data, ...) -> tf.Tensor[B]
+      loss_out_type: function (...) -> Data[B]
+      alignment: function (source: (i: int, as_data: bool = False, ...) -> tf.Tensor|Data, ...) -> tf.Tensor[B,U']
+      alignment_out_type: function (sources: list[LayerBase], ...) -> Data[B,U']
+    """
+    self.name = name
+    self.loss = loss
+    self.loss_out_type = loss_out_type
+    self.alignment = alignment
+    self.alignment_out_type = alignment_out_type
+
+
+rna_topology = Topology(
+  name="rna",
+  loss=rna_tf_loss,
+  loss_out_type=rna_loss_out_type,
+  alignment=rna_alignment,
+  alignment_out_type=rna_alignment_out_type)
+
+rnnt_topology = Topology(
+  name="rnnt",
+  loss=rnnt_tf_loss,
+  loss_out_type=rnnt_loss_out_type,
+  alignment=rnnt_alignment,
+  alignment_out_type=rnnt_alignment_out_type)
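
To illustrate the abstraction this class buys, a small hedged sketch: a hypothetical helper (not part of this commit) that builds both eval layers for whichever topology is selected, so switching between RNA and RNN-T is a one-line change:

# Hypothetical helper (names are assumptions): derive the loss and alignment
# eval layers from a Topology object.
def make_topology_layers(topo: Topology, sources: list) -> dict:
  """Builds network-dict entries for the given topology's loss and alignment."""
  return {
    "%s_loss" % topo.name: {
      "class": "eval", "from": sources,
      "eval": topo.loss, "out_type": topo.loss_out_type, "loss": "as_is"},
    "%s_alignment" % topo.name: {
      "class": "eval", "from": sources,
      "eval": topo.alignment, "out_type": topo.alignment_out_type,
      "is_output_layer": True},
  }

# Example: only the topology object changes between RNA and RNN-T setups.
layers = make_topology_layers(rnnt_topology, ["output_log_prob", "data:classes", "base:encoder"])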
