Skip to content

Commit addc714

Browse files
committed
fix: Fix typical_p value edge case causing tensor OOB errors
In TypicalLogitsWarper we need to ensure that last_ind is a valid index for the vocab size. In low precision situations, it can end up that the cumulative mass covers the entire vocab even when it is < 1 (specifically observed for typical_p = 0.99 with bfloat16). Also clean up patched version of TopPLogitsWarper now that the fix is in transformers.
1 parent 7c539a8 commit addc714

File tree

2 files changed

+7
-47
lines changed

2 files changed

+7
-47
lines changed

router/src/queue.rs

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -411,9 +411,8 @@ impl From<&GenerateParameters> for NextTokenChooserParameters {
411411
theta => Some(theta),
412412
},
413413
length_penalty: parameters.length_penalty
414-
.map(|lp| LengthPenalty {
415-
start_index: lp.0,
416-
decay_factor: lp.1,
414+
.map(|(start_index, decay_factor)| LengthPenalty {
415+
start_index, decay_factor
417416
}),
418417
}
419418
}

server/text_generation_server/utils/logits_process.py

Lines changed: 5 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
LogitsProcessor,
1212
TemperatureLogitsWarper,
1313
TopKLogitsWarper,
14-
#TopPLogitsWarper,
14+
TopPLogitsWarper,
1515
#TypicalLogitsWarper,
1616
)
1717

@@ -419,46 +419,7 @@ def filter(self, indices):
419419
return None
420420

421421

422-
423-
424-
# These are fixed versions of the classes in transformers. Remove them after upgrading to transformers >= 4.30.
425-
# See https://github.com/huggingface/transformers/pull/24111
426-
class TopPLogitsWarper(LogitsWarper):
427-
"""
428-
[`LogitsWarper`] that performs top-p, i.e. restricting to top tokens summing to prob_cut_off <= prob_cut_off.
429-
430-
Args:
431-
top_p (`float`):
432-
If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
433-
higher are kept for generation.
434-
filter_value (`float`, *optional*, defaults to `-float("Inf")`):
435-
All filtered values will be set to this float value.
436-
min_tokens_to_keep (`int`, *optional*, defaults to 1):
437-
Minimum number of tokens that cannot be filtered.
438-
"""
439-
440-
def __init__(self, top_p: float, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1):
441-
top_p = float(top_p)
442-
if top_p < 0 or top_p > 1.0:
443-
raise ValueError(f"`top_p` has to be a float > 0 and < 1, but is {top_p}")
444-
445-
self.top_p = top_p
446-
self.filter_value = filter_value
447-
self.min_tokens_to_keep = min_tokens_to_keep
448-
449-
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
450-
sorted_logits, sorted_indices = torch.sort(scores, descending=False)
451-
cumulative_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1)
452-
453-
# Remove tokens with cumulative top_p above the threshold (token with 0 are kept)
454-
sorted_indices_to_remove = cumulative_probs <= (1 - self.top_p)
455-
sorted_indices_to_remove[..., -self.min_tokens_to_keep :] = 0
456-
457-
# scatter sorted tensors to original indexing
458-
indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
459-
scores = scores.masked_fill(indices_to_remove, self.filter_value)
460-
return scores
461-
422+
# This is a fixed version of the class in transformers. Can be moved once we contribute back the fix and upgrade.
462423
class TypicalLogitsWarper(LogitsWarper):
463424
r"""
464425
[`LogitsWarper`] that performs typical decoding. See [Typical Decoding for Natural Language
@@ -475,7 +436,7 @@ class TypicalLogitsWarper(LogitsWarper):
475436

476437
def __init__(self, mass: float = 0.9, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1):
477438
mass = float(mass)
478-
if not (mass > 0 and mass < 1):
439+
if not (0 < mass < 1):
479440
raise ValueError(f"`typical_p` has to be a float > 0 and < 1, but is {mass}")
480441

481442
self.filter_value = filter_value
@@ -496,11 +457,11 @@ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> to
496457

497458
# Remove tokens with cumulative mass above the threshold
498459
last_ind = (cumulative_probs < self.mass).sum(dim=1)
499-
last_ind[last_ind < 0] = 0
460+
last_ind.clamp_(0, sorted_scores.shape[-1] - 1)
500461
sorted_indices_to_remove = sorted_scores > sorted_scores.gather(1, last_ind.view(-1, 1))
501462
# Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below)
502463
sorted_indices_to_remove[..., : self.min_tokens_to_keep] = 0
503464
indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
504465

505466
scores = scores.masked_fill(indices_to_remove, self.filter_value)
506-
return scores
467+
return scores

0 commit comments

Comments
 (0)