
Commit f543d94

Fix a bug with computing the output mask after generate (#1029)
We were calling cumsum with the wrong axis, meaning we were not correctly masking all positions after an end token.
Parent commit: 7e8f5a3
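
To make the fix concrete, here is a small illustrative sketch (toy shapes, not taken from the keras-nlp source): for a [batch, sequence] tensor of end-token locations, tf.math.cumsum defaults to axis=0, so the buggy call accumulated down the batch dimension instead of along each sequence, and positions after an end token were never flagged as overflow.

import tensorflow as tf

# Toy [batch=2, sequence=5] tensor; 1 marks where an end token was generated.
end_locations = tf.constant(
    [
        [0, 0, 1, 0, 0],
        [0, 1, 0, 0, 0],
    ],
    dtype=tf.int32,
)

# Buggy call: the default axis is 0, so the exclusive cumulative sum runs
# down the batch dimension and the first sequence never sees its end token.
buggy_overflow = tf.math.cumsum(end_locations, exclusive=True)
# [[0, 0, 0, 0, 0],
#  [0, 0, 1, 0, 0]]

# Fixed call: axis=-1 accumulates along the sequence, so every position
# strictly after an end token becomes 1 (exclusive=True keeps the end token
# itself out of the overflow).
fixed_overflow = tf.math.cumsum(end_locations, exclusive=True, axis=-1)
# [[0, 0, 0, 1, 1],
#  [0, 0, 1, 1, 1]]

# The padding mask is the inverse: True for tokens we keep in the output.
padding_mask = ~tf.cast(fixed_overflow, tf.bool)
# [[ True,  True,  True, False, False],
#  [ True,  True, False, False, False]]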

4 files changed, 44 additions and 6 deletions

keras_nlp/models/gpt2/gpt2_causal_lm.py

Lines changed: 1 addition & 1 deletion
@@ -363,7 +363,7 @@ def next(prompt, cache, index):
             end_locations = (token_ids == end_token_id) & (~padding_mask)
             end_locations = tf.cast(end_locations, tf.int32)
             # Use cumsum to get ones in all locations after end_locations.
-            overflow = tf.math.cumsum(end_locations, exclusive=True)
+            overflow = tf.math.cumsum(end_locations, exclusive=True, axis=-1)
             # Our padding mask is the inverse of these overflow locations.
             padding_mask = ~tf.cast(overflow, tf.bool)
         else:
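
For reference, a self-contained walkthrough of the masking logic above, using made-up token ids: end tokens are only counted where the original padding mask is False (positions the model generated, not the prompt), and the exclusive cumulative sum then marks everything strictly after them, so the end token itself survives in the output.

import tensorflow as tf

end_token_id = 3  # illustrative id, not the real GPT-2 end token
# One sequence: a two-token prompt, then the model generates 7, the end
# token, and two more tokens the sampler kept producing afterwards.
token_ids = tf.constant([[11, 22, 7, 3, 99, 99]])
# Original padding mask: True over the prompt tokens only.
padding_mask = tf.constant([[True, True, False, False, False, False]])

# Only count end tokens the model generated, not any in the prompt.
end_locations = (token_ids == end_token_id) & (~padding_mask)
end_locations = tf.cast(end_locations, tf.int32)  # [[0, 0, 0, 1, 0, 0]]
# Ones in all locations strictly after the end token.
overflow = tf.math.cumsum(end_locations, exclusive=True, axis=-1)
# [[0, 0, 0, 0, 1, 1]]
# Keep the prompt, the generated tokens, and the end token; drop the rest.
padding_mask = ~tf.cast(overflow, tf.bool)
# [[True, True, True, True, False, False]]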

keras_nlp/models/gpt2/gpt2_causal_lm_test.py

Lines changed: 21 additions & 2 deletions
@@ -14,7 +14,9 @@
 """Tests for GPT2 causal LM model."""

 import os
+from unittest.mock import patch

+import numpy as np
 import pytest
 import tensorflow as tf
 from absl.testing import parameterized
@@ -54,8 +56,8 @@ def setUp(self):
             vocabulary_size=self.preprocessor.tokenizer.vocabulary_size(),
             num_layers=2,
             num_heads=2,
-            hidden_dim=64,
-            intermediate_dim=128,
+            hidden_dim=4,
+            intermediate_dim=8,
             max_sequence_length=self.preprocessor.packer.sequence_length,
         )
         self.causal_lm = GPT2CausalLM(
@@ -118,6 +120,23 @@ def test_generate(self):
             self.preprocessed_batch["padding_mask"][:, :5],
         )

+    def test_early_stopping(self):
+        call_with_cache = self.causal_lm.call_with_cache
+
+        def wrapper(*args, **kwargs):
+            """Modify output logits to always favor end_token_id"""
+            logits, hidden_states, cache = call_with_cache(*args, **kwargs)
+            logits = np.zeros(logits.shape.as_list())
+            logits[:, :, self.preprocessor.tokenizer.end_token_id] = 1.0e9
+            return logits, hidden_states, cache
+
+        with patch.object(self.causal_lm, "call_with_cache", wraps=wrapper):
+            prompt = [" airplane at airport", " airplane"]
+            output = self.causal_lm.generate(prompt)
+            # We should immediately abort and output the prompt.
+            self.assertEqual(prompt, output)
+            self.assertEqual(self.causal_lm.call_with_cache.call_count, 2)
+
     def test_generate_compilation(self):
         # Assert we do not recompile with successive calls.
         self.causal_lm.generate(self.raw_batch)
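
The new test leans on unittest.mock's wraps behavior. A minimal standalone sketch of the same pattern (FakeLM and its method are invented names): patch.object(..., wraps=fn) swaps the method for a MagicMock that forwards every call to fn while still recording call_count, which is how the test can both rewrite the logits to favor end_token_id and assert how many forward passes generate() made.

from unittest.mock import patch


class FakeLM:
    """Minimal stand-in for the model under test."""

    def call_with_cache(self, x):
        return x + 1


lm = FakeLM()
real_call = lm.call_with_cache  # bind the real method before patching


def wrapper(*args, **kwargs):
    # Run the real method, then tamper with its output, mirroring how the
    # test above rewrites the logits.
    out = real_call(*args, **kwargs)
    return out * 10


with patch.object(lm, "call_with_cache", wraps=wrapper):
    assert lm.call_with_cache(2) == 30  # the wrapper's output, not 3
    assert lm.call_with_cache.call_count == 1  # calls are still counted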

keras_nlp/models/opt/opt_causal_lm.py

Lines changed: 1 addition & 1 deletion
@@ -358,7 +358,7 @@ def next(prompt, cache, index):
             end_locations = (token_ids == end_token_id) & (~padding_mask)
             end_locations = tf.cast(end_locations, tf.int32)
             # Use cumsum to get ones in all locations after end_locations.
-            overflow = tf.math.cumsum(end_locations, exclusive=True)
+            overflow = tf.math.cumsum(end_locations, exclusive=True, axis=-1)
             # Our padding mask is the inverse of these overflow locations.
             padding_mask = ~tf.cast(overflow, tf.bool)
         else:

keras_nlp/models/opt/opt_causal_lm_test.py

Lines changed: 21 additions & 2 deletions
@@ -14,7 +14,9 @@
 """Tests for OPT causal LM model."""

 import os
+from unittest.mock import patch

+import numpy as np
 import pytest
 import tensorflow as tf
 from absl.testing import parameterized
@@ -60,8 +62,8 @@ def setUp(self):
             vocabulary_size=self.preprocessor.tokenizer.vocabulary_size(),
             num_layers=2,
             num_heads=2,
-            hidden_dim=64,
-            intermediate_dim=128,
+            hidden_dim=4,
+            intermediate_dim=8,
             max_sequence_length=self.preprocessor.packer.sequence_length,
         )
         self.causal_lm = OPTCausalLM(
@@ -124,6 +126,23 @@ def test_generate(self):
             self.preprocessed_batch["padding_mask"][:, :5],
         )

+    def test_early_stopping(self):
+        call_with_cache = self.causal_lm.call_with_cache
+
+        def wrapper(*args, **kwargs):
+            """Modify output logits to always favor end_token_id"""
+            logits, hidden_states, cache = call_with_cache(*args, **kwargs)
+            logits = np.zeros(logits.shape.as_list())
+            logits[:, :, self.preprocessor.tokenizer.end_token_id] = 1.0e9
+            return logits, hidden_states, cache
+
+        with patch.object(self.causal_lm, "call_with_cache", wraps=wrapper):
+            prompt = [" airplane at airport", " airplane"]
+            output = self.causal_lm.generate(prompt)
+            # We should immediately abort and output the prompt.
+            self.assertEqual(prompt, output)
+            self.assertEqual(self.causal_lm.call_with_cache.call_count, 2)
+
     def test_generate_compilation(self):
         # Assert we do not recompile with successive calls.
         self.causal_lm.generate(self.raw_batch)
