Skip to content

Commit 974f60f

Browse files
lingvo-bot authored and copybara-github committed
Refactor the way large initial numbers are used so that they don't cause overflow.
PiperOrigin-RevId: 491478236
1 parent 0f8caa2 commit 974f60f

File tree

7 files changed

+15
-15
lines changed

7 files changed

+15
-15
lines changed

lingvo/core/attention.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -394,8 +394,8 @@ def _PaddedSoftmax(self, logits, padding):
394394
assert logits.dtype.is_floating
395395
assert hasattr(logits.dtype, 'max')
396396
very_negative_logits = (
397-
tf.ones_like(logits) * logits.dtype.max *
398-
tf.constant(-0.7, dtype=logits.dtype))
397+
tf.ones_like(logits) *
398+
tf.constant(-0.7 * logits.dtype.max, dtype=logits.dtype))
399399
if self.do_eval:
400400
very_negative_logits = self.QAct('logits', very_negative_logits)
401401
padded_logits = tf.where(padding > 0.0, very_negative_logits, logits)

lingvo/core/attention_util.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -786,7 +786,7 @@ def FProp(self, theta, x, paddings=None, update=False):
786786

787787
# For padded positions we update the distances to very large numbers.
788788
very_large_dists = tf.ones_like(dists) * tf.constant(
789-
0.1, dtype=dists.dtype) * dists.dtype.max
789+
0.1 * dists.dtype.max, dtype=dists.dtype)
790790
paddings_tiled = tf.tile(paddings_4d, [1, 1, p.num_heads, p.num_clusters])
791791
dists = tf.where(paddings_tiled > 0.0, very_large_dists, dists)
792792

@@ -977,8 +977,8 @@ def ComputeSparseAttention(q, k, v, sparsity_indices, paddings=None):
977977
logits *= tf.math.rsqrt(tf.cast(dim_per_head, q.dtype))
978978

979979
very_negative_logits = (
980-
tf.ones_like(logits) * logits.dtype.max *
981-
tf.constant(-0.7, dtype=logits.dtype))
980+
tf.ones_like(logits) *
981+
tf.constant(-0.7 * logits.dtype.max, dtype=logits.dtype))
982982
padded_logits = tf.where(
983983
tf.math.logical_or(sparsity_indices < 0, paddings > 0.0),
984984
very_negative_logits, logits)

lingvo/core/batch_major_attention_test.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -327,8 +327,8 @@ def testMultiHeadedAttentionDotProductSegmentMask(self):
327327
segment_id = tf.zeros([6, 6])
328328
segment_mask = attention.SegmentMask(segment_id, segment_id)
329329
padding = tf.tile(tf.reshape(input_padding, [6, 1, 1, 6]), [1, 1, 6, 1])
330-
padding_mask = padding * segment_mask.dtype.max * tf.constant(
331-
-0.7, dtype=segment_mask.dtype)
330+
padding_mask = padding * tf.constant(
331+
-0.7 * segment_mask.dtype.max, dtype=segment_mask.dtype)
332332
segment_mask += padding_mask
333333

334334
l = p.Instantiate()

lingvo/core/conv_layers_builder.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ def FProp(self, theta, inputs, paddings):
9595

9696
window_size = p.left_context
9797
left_pad_size = window_size - 1
98-
large_negative = p.dtype.max * tf.constant(-0.7, dtype=p.dtype)
98+
large_negative = tf.constant(-0.7 * p.dtype.max, dtype=p.dtype)
9999
# For max pooling, use a large negative padding value such that the max
100100
# element is almost always from a non-padding position.
101101
pad_value = 0 if p.pooling_type == 'AVG' else large_negative

lingvo/core/conv_layers_with_time_padding.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1006,7 +1006,7 @@ def FProp(self, theta, inputs, paddings):
10061006
out_feature = global_sum / tf.maximum(1.0, count)
10071007
elif p.pooling_type == 'MAX':
10081008
large_negative = (
1009-
tf.ones_like(inputs) * p.dtype.max * tf.constant(-0.7, dtype=p.dtype))
1009+
tf.ones_like(inputs) * tf.constant(-0.7 * p.dtype.max, dtype=p.dtype))
10101010
padded_inputs = tf.where_v2(mask > 0.0, inputs, large_negative)
10111011
out_feature = tf.reduce_max(padded_inputs, axis=[1, 2], keepdims=True)
10121012
if paddings is None:

lingvo/core/gshard_layers.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2121,8 +2121,8 @@ def _CreateOverCapacityRatioSummary(mask, position_in_expert, capacity, name):
21212121
# Generates standard Gumbel(0, 1) noise, GSE Tensors
21222122
noise = -tf.math.log(-tf.math.log(noise))
21232123
very_negative_logits = _MaybeSplit(
2124-
(tf.ones_like(logits) * logits.dtype.max *
2125-
tf.constant(-0.7, dtype=logits.dtype)))
2124+
(tf.ones_like(logits) *
2125+
tf.constant(-0.7 * logits.dtype.max, dtype=logits.dtype)))
21262126
# Gets rid of the first expert by setting its logit to be very negative
21272127
updated_logits = _MaybeSplit(
21282128
tf.where(mask_1 > 0.0, very_negative_logits, logits))

lingvo/tasks/mt/decoder.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1170,8 +1170,8 @@ def _ForceAlignment(self, log_probs, source_num_sentences, hyp_num_sentences):
11701170
# the current hyp contains fewer sentences than expected to disallow
11711171
# eos in such misaligned cases.
11721172
large_negative_value = tf.ones_like(log_probs[:, eos_id]) * tf.constant(
1173-
-self._FLOAT_DTYPE_MAX_SCALER,
1174-
dtype=log_probs.dtype) * log_probs.dtype.max
1173+
-self._FLOAT_DTYPE_MAX_SCALER * log_probs.dtype.max,
1174+
dtype=log_probs.dtype)
11751175
eos_log_probs = tf.where(
11761176
tf.math.greater(source_num_sentences, hyp_num_sentences),
11771177
large_negative_value, log_probs[:, eos_id])
@@ -1214,8 +1214,8 @@ def _UpdateLogitsForSingleTokenFastDecode(self, log_probs, is_single_token,
12141214
is_eos = tf.math.equal(tf.range(v), tf.ones_like(tf.range(v)) * eos_id)
12151215
is_eos = tf.tile(tf.expand_dims(is_eos, 0), [b, 1])
12161216
large_neg_probs = tf.ones_like(log_probs) * tf.constant(
1217-
-self._FLOAT_DTYPE_MAX_SCALER,
1218-
dtype=log_probs.dtype) * log_probs.dtype.max
1217+
-self._FLOAT_DTYPE_MAX_SCALER * log_probs.dtype.max,
1218+
dtype=log_probs.dtype)
12191219
new_log_probs = tf.where(is_eos, tf.zeros_like(large_neg_probs),
12201220
large_neg_probs)
12211221
return tf.where(is_single_token_2d, new_log_probs, log_probs)

0 commit comments

Comments (0)