Commit 2470e7d

tjohnson31415 authored and njhill committed
fix: repetition penalty bug if EOS and PAD tokens have the same id

Since the decoding vectorization changes, the pad tokens are also passed in to the repetition penalty processor. In the case where the pad token id is equal to the EOS token id, the EOS token gets penalized simply because the padded slots share its id, even though it was never generated. This bug was found when testing with the `EleutherAI/gpt-neox-20b` model in TGIS. Having pad token id == eos token id does not seem to be very common, but it is also the fallback if the pad token cannot be found another way.

There is also a small optimization in this PR: a view over all_input_ids_tensor is passed into `next_token_chooser` so that the pre-allocated output slots holding the pad token are not processed.

Signed-off-by: Travis Johnson <[email protected]>
1 parent 85918c5 commit 2470e7d
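
For context on the failure mode described above, here is a minimal sketch (not the server code; the ids, shapes, and scores below are made up) of how the vectorized repetition penalty gathers a score for every id in all_input_ids_tensor, so pad slots drag the EOS column into the penalty whenever pad_token_id == eos_token_id:

import torch

# Hypothetical ids and shapes, for illustration only.
EOS = PAD = 0                                        # e.g. gpt-neox-20b falls back to pad == eos
all_input_ids = torch.tensor([[5, 6, 7, PAD, PAD]])  # trailing slots are padding, not generated tokens
scores = torch.full((1, 8), 1.0)                     # tiny vocabulary of 8 ids
scores[0, EOS] = 9.0                                 # the model strongly wants to emit EOS

# The vectorized penalty gathers a score for *every* id in the tensor, so the
# pad slots pull in the EOS column and it gets penalized like a repeated token.
gathered = torch.gather(scores, 1, all_input_ids)
print(gathered)  # tensor([[1., 1., 1., 9., 9.]]) -- EOS captured twice via padding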

File tree

6 files changed (+79, -5 lines)


integration_tests/test_cases_tinystarcoderpy.yaml

Lines changed: 48 additions & 0 deletions

@@ -47,6 +47,54 @@
         stopReason: MAX_TOKENS
         text: "\nclass Shape(object):\n '''Shape class'''\n\n def __init__(self, x, y, z):"
 
+# Regression test case for a bug that was found with the vectorization changes
+# If a model has eos_token_id == pad_token_id, we need to make sure that the
+# repetition penalty doesn't penalize the EOS token score just because the
+# all_input_ids_tensor has padding.
+# See also https://github.ibm.com/ai-foundation/fmaas-inference-server/pull/399
+#
+# First, see what the output would be with no padding in the request
+- name: Regression Test - don't penalize EOS because of PAD [1]
+  request:
+    params:
+      method: GREEDY
+      stopping:
+        maxNewTokens: 10
+      decoding:
+        repetition_penalty: 100
+    requests:
+      - &hello_request {"text": "def print_hello():\n\t"}
+  response:
+    responses:
+      - &hello_response
+        generatedTokenCount: 8
+        inputTokenCount: 6
+        stopReason: EOS_TOKEN
+        text: "\tprint(\"Hello World!\")\n"
+# we should get the same result with padding
+- name: Regression Test - don't penalize EOS because of PAD [2]
+  request:
+    params:
+      method: GREEDY
+      stopping:
+        maxNewTokens: 10
+      decoding:
+        repetition_penalty: 100
+    requests:
+      - *hello_request
+      # need two requests, since there is no padding with a one request batch...
+      # the second request needs to be longer than the first and generate more
+      # than one token as well so that the first is processed with padding
+      - {"text": "# write a function that prints hello world"}
+  response:
+    responses:
+      - *hello_response
+      - generatedTokenCount: 10
+        inputTokenCount: 8
+        stopReason: MAX_TOKENS
+        text: "\ndef print_hello():\n # create an"
+
+
 # Multiple inputs with token info
 - name: Multiple inputs with token info
   request:
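
A rough numeric intuition (logit values invented, not taken from the model) for why the test pins repetition_penalty to 100: if the padded first request has its EOS logit penalized, greedy decoding keeps picking other tokens and the stop reason flips from EOS_TOKEN to MAX_TOKENS.

# Illustrative numbers only.
eos_logit, next_best_logit = 9.0, 4.0
penalty = 100.0

# CTRL-style penalty: positive scores are divided by the penalty,
# negative scores are multiplied by it.
penalized_eos = eos_logit / penalty if eos_logit > 0 else eos_logit * penalty

assert penalized_eos < next_best_logit  # greedy decoding would now skip EOS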

server/text_generation_server/models/causal_lm.py

Lines changed: 3 additions & 1 deletion

@@ -116,6 +116,7 @@ def from_pb(
         next_token_chooser = HeterogeneousNextTokenChooser.from_pb(
             pb=next_token_chooser_parameters,
             model_eos_token_id=getattr(tokenizer, 'model_eos_token_id', tokenizer.eos_token_id),
+            model_pad_token_id=tokenizer.pad_token_id,
             return_logprobs=return_logprobs,
             dtype=dtype,
             device=device,
@@ -322,6 +323,7 @@ def concatenate(cls, batches: List["CausalLMBatch"]) -> "CausalLMBatch":
         next_token_chooser = HeterogeneousNextTokenChooser.from_pb(
             pb=next_token_chooser_parameters,
             model_eos_token_id=batches[0].next_token_chooser.eos_token_id,
+            model_pad_token_id=batches[0].next_token_chooser.pad_token_id,
             return_logprobs=ntc_return_logprobs,
             dtype=batches[0].next_token_chooser.dtype,
             device=batches[0].next_token_chooser.device,
@@ -638,7 +640,7 @@ def generate_token(
 
         # Heterogeneous next token chooser expects last logits in the sequence
         next_input_ids, next_token_scores, next_token_logprobs = batch.next_token_chooser(
-            input_ids=batch.all_input_ids_tensor, scores=logits[:, -1, :]
+            input_ids=batch.all_input_ids_tensor[:, : -batch.padding_right_offset], scores=logits[:, -1, :]
         )
 
         # Generated tokens
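
A rough sketch of what the new view buys (shapes are made up, and padding_right_offset here is a plain local variable standing in for the batch attribute): the pre-allocated output slots on the right are dropped before the ids reach the logits processors, without copying the tensor. The flash-attention path below applies the same idea by slicing to max_seqlen, and the seq2seq path does the same on the decoder ids.

import torch

PAD = 0
# Pre-allocated ids for a batch of two requests; the trailing columns are
# output slots that have not been filled yet, so they still hold the pad id.
all_input_ids_tensor = torch.tensor([
    [5, 6, 7, 8, PAD, PAD, PAD],
    [9, 3, 4, 2, PAD, PAD, PAD],
])
padding_right_offset = 3  # slots reserved for tokens not yet generated

# The view handed to next_token_chooser now stops before the reserved slots.
view = all_input_ids_tensor[:, : -padding_right_offset]
assert view.shape == (2, 4)
# It is a view, not a copy: both tensors share the same storage.
assert view.data_ptr() == all_input_ids_tensor.data_ptr()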

server/text_generation_server/models/flash_causal_lm.py

Lines changed: 3 additions & 1 deletion

@@ -171,6 +171,7 @@ def from_pb(
         next_token_chooser = HeterogeneousNextTokenChooser.from_pb(
             pb=next_token_chooser_parameters,
             model_eos_token_id=getattr(tokenizer, 'model_eos_token_id', tokenizer.eos_token_id),
+            model_pad_token_id=tokenizer.pad_token_id,
             return_logprobs=return_logprobs,
             dtype=dtype,
             device=device,
@@ -258,6 +259,7 @@ def concatenate(cls, batches: List["FlashCausalLMBatch"]) -> "FlashCausalLMBatch
         next_token_chooser = HeterogeneousNextTokenChooser.from_pb(
             pb=next_token_chooser_parameters,
             model_eos_token_id=first_next_token_chooser.eos_token_id,
+            model_pad_token_id=first_next_token_chooser.pad_token_id,
             return_logprobs=ntc_return_logprobs,
             dtype=first_next_token_chooser.dtype,
             device=first_next_token_chooser.device,
@@ -521,7 +523,7 @@ def _process_new_tokens(
         logits = out
 
         next_token_ids, next_token_scores, next_token_logprobs = batch.next_token_chooser(
-            input_ids=batch.all_input_ids_tensor, scores=logits,
+            input_ids=batch.all_input_ids_tensor[:, :batch.max_seqlen], scores=logits,
         )
 
         # add the next token ids to all_input_ids_tensor

server/text_generation_server/models/seq2seq_lm.py

Lines changed: 3 additions & 1 deletion

@@ -216,6 +216,7 @@ def from_pb(
         next_token_chooser = HeterogeneousNextTokenChooser.from_pb(
             pb=next_token_chooser_parameters,
             model_eos_token_id=getattr(tokenizer, 'model_eos_token_id', tokenizer.eos_token_id),
+            model_pad_token_id=tokenizer.pad_token_id,
             return_logprobs=return_logprobs,
             dtype=dtype,
             device=device
@@ -432,6 +433,7 @@ def concatenate(cls, batches: List["Seq2SeqLMBatch"]) -> "Seq2SeqLMBatch":
         next_token_chooser = HeterogeneousNextTokenChooser.from_pb(
             pb=next_token_chooser_parameters,
             model_eos_token_id=batches[0].next_token_chooser.eos_token_id,
+            model_pad_token_id=batches[0].next_token_chooser.pad_token_id,
             return_logprobs=ntc_return_logprobs,
             dtype=batches[0].next_token_chooser.dtype,
             device=batches[0].next_token_chooser.device,
@@ -644,7 +646,7 @@ def generate_token(
         )
 
         next_input_ids, next_token_scores, next_token_logprobs = batch.next_token_chooser(
-            input_ids=batch.all_decoder_input_ids_tensor, scores=logits[:, -1, :]
+            input_ids=batch.all_decoder_input_ids_tensor[:, : - batch.padding_right_offset], scores=logits[:, -1, :]
         )
 
         # Generated tokens

server/text_generation_server/utils/logits_process.py

Lines changed: 15 additions & 1 deletion

@@ -102,13 +102,22 @@ class HeterogeneousRepetitionPenaltyLogitsProcessor(LogitsProcessor):
     paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
     """
 
-    def __init__(self, penalty: List[float], dtype: torch.dtype, device: torch.device):
+    def __init__(self, penalty: List[float], dtype: torch.dtype, device: torch.device, id_to_exclude: Optional[int] = None):
         self.penalty = penalty
         self.penalty_tensor = torch.tensor(
             penalty, dtype=dtype, device=device
         ).unsqueeze(1)
+        self.id_to_exclude = id_to_exclude
 
     def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> torch.Tensor:
+        # as an optimization for a common case, we skip the exclusion if there is only
+        # one request in the batch (assumes that there is no padding with a single request)
+        do_exclude = self.id_to_exclude is not None and input_ids.shape[0] != 1
+        # save out the original scores if we are excluding an id
+        if do_exclude:
+            # tensor is updated in-place, so need to clone here
+            scores_of_id_to_exclude = scores[:, self.id_to_exclude].clone()
+
         score = torch.gather(scores, 1, input_ids)
 
         # if score < 0 then repetition penalty has to be multiplied to reduce the previous token probability
@@ -117,6 +126,11 @@ def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> torch.Tenso
         )
 
         scores.scatter_(1, input_ids, score)
+
+        # restore the scores for the "excluded" id
+        if do_exclude:
+            scores[:, self.id_to_exclude] = scores_of_id_to_exclude
+
         return scores
 
     def filter(self, indices):
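
To see the fix end to end, a small standalone reproduction (the processor's logic condensed into a plain function; batch contents are invented) of the save/restore behaviour added above: the excluded column is cloned before the in-place gather/scatter and written back afterwards, so padding with pad_token_id == eos_token_id no longer depresses the EOS score.

import torch

def penalize(scores, input_ids, penalty, id_to_exclude=None):
    # Condensed from HeterogeneousRepetitionPenaltyLogitsProcessor.__call__;
    # single-request batches skip the exclusion, as in the real code.
    do_exclude = id_to_exclude is not None and input_ids.shape[0] != 1
    if do_exclude:
        saved = scores[:, id_to_exclude].clone()  # scores is updated in-place below
    score = torch.gather(scores, 1, input_ids)
    score = torch.where(score < 0, score * penalty, score / penalty)
    scores.scatter_(1, input_ids, score)
    if do_exclude:
        scores[:, id_to_exclude] = saved          # restore the EOS/PAD column
    return scores

EOS = PAD = 0
penalty = torch.tensor([[100.0], [100.0]])
input_ids = torch.tensor([[5, 6, PAD, PAD], [5, 6, 7, 8]])   # request 0 is padded
scores = torch.tensor([[9.0, 1.0, 2.0, 3.0, 2.5, 1.5, 0.5, 0.1, 0.2],
                       [9.0, 1.0, 2.0, 3.0, 2.5, 1.5, 0.5, 0.1, 0.2]])

out = penalize(scores, input_ids, penalty, id_to_exclude=EOS)
print(out[0, EOS])  # tensor(9.) -- EOS untouched despite the pad columns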

server/text_generation_server/utils/tokens.py

Lines changed: 7 additions & 1 deletion

@@ -173,6 +173,7 @@ def __init__(
         min_new_tokens: List[int],
         return_logprobs: List[bool],
         eos_token_id: Optional[int] = None,
+        pad_token_id: Optional[int] = None,
         device: Optional[torch.device] = None,
         dtype: torch.dtype = None,
         # allow passing in existing values to support combining HNTC instances
@@ -181,7 +182,9 @@
         warpers = []
         self.repetition_processor = (
             HeterogeneousRepetitionPenaltyLogitsProcessor(
-                repetition_penalty, dtype, device
+                repetition_penalty, dtype, device,
+                # do not penalize the eos token if it is the same id as the pad token
+                id_to_exclude = eos_token_id if eos_token_id == pad_token_id else None,
             )
             if any(x != 1.0 for x in repetition_penalty)
             else None
@@ -215,6 +218,7 @@
 
         self.warpers = warpers
         self.eos_token_id = eos_token_id
+        self.pad_token_id = pad_token_id
         self.length_penalty = length_penalty
         self.min_new_tokens = min_new_tokens
         self.current_tokens = current_tokens if current_tokens is not None else [0] * len(do_sample)
@@ -267,6 +271,7 @@ def from_pb(
         cls,
         pb: List[generate_pb2.NextTokenChooserParameters],
         model_eos_token_id: Optional[int],
+        model_pad_token_id: Optional[int],
         return_logprobs: List[bool],
         dtype: torch.dtype,
         device: torch.device,
@@ -291,6 +296,7 @@
             seeds=seeds,
             min_new_tokens=[pb_.min_new_tokens for pb_ in pb],
             eos_token_id=model_eos_token_id,
+            pad_token_id=model_pad_token_id,
             return_logprobs=return_logprobs,
             device=device,
             dtype=dtype,
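
Restated as a tiny hedged sketch (a hypothetical free-standing helper, not the actual HeterogeneousNextTokenChooser code): the pad id only changes behaviour when it collides with the EOS id, and the repetition processor is only constructed when some request in the batch actually uses a penalty.

from typing import List, Optional

def pick_id_to_exclude(eos_token_id: Optional[int],
                       pad_token_id: Optional[int]) -> Optional[int]:
    # Do not penalize the EOS token if it shares an id with the PAD token;
    # with distinct ids the penalty behaves exactly as before.
    return eos_token_id if eos_token_id == pad_token_id else None

def needs_repetition_processor(repetition_penalty: List[float]) -> bool:
    # The processor is skipped entirely when no request in the batch uses it.
    return any(x != 1.0 for x in repetition_penalty)

assert pick_id_to_exclude(0, 0) == 0     # gpt-neox-20b style fallback: pad == eos
assert pick_id_to_exclude(2, 0) is None  # distinct ids: nothing is excluded
assert not needs_repetition_processor([1.0, 1.0])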
