Commit 642041d

Some minor token processing logic cleanup
From recent code observations
1 parent 1feed99 commit 642041d

3 files changed (+39, −36 lines)

router/src/server.rs

Lines changed: 1 addition & 1 deletion

@@ -207,7 +207,7 @@ impl<B: BatchType> BatchConfigValidator<B> {
         );
         if max_prefill_weight < single_request_prefill_weight {
             panic!(
-                "max_prefill_weight ({}) not large enough for max_sequence_length ({}",
+                "max_prefill_weight ({}) not large enough for max_sequence_length ({})",
                 max_prefill_weight, max_sequence_length
             )
         }

server/text_generation_server/utils/logits_process.py

Lines changed: 21 additions & 21 deletions

@@ -121,10 +121,10 @@ def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> torch.Tensor:
 
     def filter(self, indices):
         self.penalty = [self.penalty[i] for i in indices]
-        if any([x != 1.0 for x in self.penalty]):
-            self.penalty_tensor = self.penalty_tensor[indices]
-            return self
-        return None
+        if all(x == 1.0 for x in self.penalty):
+            return None
+        self.penalty_tensor = self.penalty_tensor[indices]
+        return self
 
 
 class HeterogeneousTemperatureLogitsWarper:
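
The three `filter` rewrites in this file share one shape: instead of testing whether any request still needs the warper and nesting the useful work inside, they return `None` early when every remaining value is the no-op (1.0) and fall through to the tensor reindexing otherwise. A minimal sketch of the pattern, assuming a processor that keeps a per-request list plus a matching tensor (the `__call__` logic that actually applies the penalty is omitted):

```python
import torch

class HeterogeneousRepetitionPenaltyLogitsProcessor:
    """Sketch only: the real class also implements __call__."""

    def __init__(self, penalty: list, dtype: torch.dtype, device: torch.device):
        self.penalty = penalty
        # One penalty per request in the batch, kept as a column for broadcasting
        self.penalty_tensor = torch.tensor(
            penalty, dtype=dtype, device=device
        ).unsqueeze(1)

    def filter(self, indices: list):
        # Keep only the requests still in the batch
        self.penalty = [self.penalty[i] for i in indices]
        # Early out: if every remaining penalty is the no-op value,
        # returning None signals the caller to drop this processor
        if all(x == 1.0 for x in self.penalty):
            return None
        self.penalty_tensor = self.penalty_tensor[indices]
        return self
```

Returning `None` presumably lets the batching code prune warpers that have become no-ops as requests complete, so later decode steps can skip their tensor work entirely.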

@@ -152,10 +152,10 @@ def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> torch.Tensor:
 
     def filter(self, indices):
         self.temperature = [self.temperature[i] for i in indices]
-        if any([x != 1.0 for x in self.temperature]):
-            self.temperature_tensor = self.temperature_tensor[indices]
-            return self
-        return None
+        if all(x == 1.0 for x in self.temperature):
+            return None
+        self.temperature_tensor = self.temperature_tensor[indices]
+        return self
 
 
 class HeterogeneousTopPLogitsWarper(LogitsWarper):

@@ -211,10 +211,10 @@ def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> torch.Tensor:
 
     def filter(self, indices):
         self.top_p = [self.top_p[i] for i in indices]
-        if any([x < 1.0 for x in self.top_p]):
-            self.top_p_opposite = self.top_p_opposite[indices]
-            return self
-        return None
+        if all(x == 1.0 for x in self.top_p):
+            return None
+        self.top_p_opposite = self.top_p_opposite[indices]
+        return self
 
 
 class HeterogeneousTopKLogitsWarper(LogitsWarper):

@@ -270,7 +270,7 @@ def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> torch.Tensor:
             top_k = self.top_k_tensor
 
         # Get the kth score for each member of the batch
-        kth_scores = torch.gather(torch.topk(scores, max_top_k)[0], 1, top_k)
+        kth_scores = torch.gather(torch.topk(scores, max_top_k).values, 1, top_k)
 
         # Mask member of kth_scores that do not want to use top_k warping
         if self.top_k_disabled_mask is not None:
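
The `[0]` → `.values` change is cosmetic: `torch.topk` returns a named tuple, so the field access is equivalent to positional indexing but self-documenting. A quick illustration:

```python
import torch

scores = torch.tensor([[0.1, 0.4, 0.3, 0.2]])

top = torch.topk(scores, k=2)
# topk returns a named tuple; .values/.indices equal [0]/[1]
assert torch.equal(top.values, top[0])   # tensor([[0.4, 0.3]])
assert torch.equal(top.indices, top[1])  # tensor([[1, 2]])
```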

@@ -376,16 +376,16 @@ def filter(self, indices):
         self.mass = [self.mass[i] for i in indices]
         disabled = [x == 1.0 for x in self.mass]
 
-        if not all(disabled):
-            self.mass_tensor = self.mass_tensor[indices]
+        if all(disabled):
+            return None
 
-            if self.disabled_mask is not None:
-                self.disabled_mask = (
-                    self.disabled_mask[indices] if any(disabled) else None
-                )
+        self.mass_tensor = self.mass_tensor[indices]
 
-            return self
-        return None
+        if self.disabled_mask is not None:
+            self.disabled_mask = (
+                self.disabled_mask[indices] if any(disabled) else None
+            )
+        return self
 
 
 # NB: This class is not currently used.
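
The typical-mass `filter` has one extra wrinkle: `mass == 1.0` marks a request as disabled, and `disabled_mask` is only worth keeping while the surviving batch is mixed. A hypothetical trace with invented values:

```python
# Hypothetical batch: masses for three requests; 1.0 means disabled
mass = [1.0, 0.9, 1.0]
indices = [0, 1]  # request 2 finished and was filtered out

mass = [mass[i] for i in indices]    # [1.0, 0.9]
disabled = [x == 1.0 for x in mass]  # [True, False]

assert not all(disabled)  # mixed batch: keep the warper...
assert any(disabled)      # ...and keep the disabled mask as well
```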

server/text_generation_server/utils/tokens.py

Lines changed: 17 additions & 14 deletions

@@ -202,12 +202,13 @@ def __init__(
             warpers.append(HeterogeneousTopKLogitsWarper(top_k, device))
 
             if any(x < 1.0 for x in top_p):
+                #assert all(x != 0 for x in top_p)
                 warpers.append(HeterogeneousTopPLogitsWarper(top_p, dtype, device))
-            # We specifically exclude degenerate case 0, we devolves into greedy decoding,
-            # to align with the nonvectorized typical logit warping behavior.
-            if any(0.0 < x < 1.0 for x in typical_p):
-                corrected_probs = [p if p != 0 else 1 for p in typical_p]
-                warpers.append(HeterogeneousTypicalLogitsWarper(corrected_probs, dtype, device))
+
+            if any(x < 1.0 for x in typical_p):
+                #assert all(x != 0 for x in typical_p)
+                warpers.append(HeterogeneousTypicalLogitsWarper(typical_p, dtype, device))
+
             self.choice = HeterogeneousSampling(do_sample, seeds, device)
         else:
             self.choice = Greedy()

@@ -279,8 +280,10 @@ def from_pb(
             temperature=[pb_.temperature for pb_ in pb],
             repetition_penalty=[pb_.repetition_penalty if pb_.HasField('repetition_penalty') else 1.0 for pb_ in pb],
             top_k=[pb_.top_k for pb_ in pb],
-            top_p=[pb_.top_p for pb_ in pb],
-            typical_p=[pb_.typical_p for pb_ in pb],
+            # Ensure that default (zero) values for top_p and typical_p are converted to 1.0
+            # (which corresponds to disabled in both cases)
+            top_p=[pb_.top_p if pb_.top_p > 0 else 1.0 for pb_ in pb],
+            typical_p=[pb_.typical_p if pb_.typical_p > 0 else 1.0 for pb_ in pb],
             length_penalty=[
                 (pb_.length_penalty.start_index, pb_.length_penalty.decay_factor)
                 if pb_.HasField('length_penalty') else None for pb_ in pb
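
The `from_pb` change deals with proto3 semantics: unset scalar fields arrive as 0, and 0 is not a meaningful `top_p` or `typical_p` (it would degenerate into greedy decoding), so the protobuf boundary maps it to 1.0, which both warpers treat as disabled. A self-contained sketch using a stand-in for the protobuf message (`FakeParams` is invented for illustration; real code reads `generate_pb2` requests):

```python
class FakeParams:
    """Stand-in for a generate_pb2 sampling-parameters message."""
    def __init__(self, top_p=0.0, typical_p=0.0):
        self.top_p = top_p        # proto3 scalars default to 0
        self.typical_p = typical_p

pb = [FakeParams(), FakeParams(top_p=0.9)]

# Map the proto3 default (0) to 1.0 == disabled, as in from_pb above
top_p = [pb_.top_p if pb_.top_p > 0 else 1.0 for pb_ in pb]
typical_p = [pb_.typical_p if pb_.typical_p > 0 else 1.0 for pb_ in pb]

assert top_p == [1.0, 0.9] and typical_p == [1.0, 1.0]
```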

@@ -310,7 +313,7 @@ def filter(self, indices):
         self.current_tokens = [self.current_tokens[i] for i in indices]
         self.min_new_tokens = [self.min_new_tokens[i] for i in indices]
         self.length_penalty = [self.length_penalty[i] for i in indices]
-        self.return_logprobs = [self.return_logprobs[i] for i in indices]
+        self.return_logprobs = [self.return_logprobs[i] for i in indices]
 
         if any(self.do_sample):
             self.choice.filter(indices)

@@ -370,12 +373,13 @@ def filter(self, indices):
         self.samplings = [self.samplings[i] for i in indices]
         return self
 
+
 # Extract requested token information from model output
 def get_token_info(
     request: generate_pb2.Request,
     scores: torch.Tensor,  # Assumes shape is [1, vocab_size]
     next_token: torch.Tensor,
-    logprobs: Optional[torch.Tensor], # Assumes shape matches logits
+    logprobs: Optional[torch.Tensor],  # Assumes shape matches logits
 ) -> TokenInfo:
     next_token = next_token.item()
     token_info = TokenInfo(request_id=request.id, token_id=next_token)

@@ -391,9 +395,8 @@ def get_token_info(
         # Ensure top_n doesn't exceed vocab size
         top_n = min(return_top_n, flat_scores.size(-1))
         # Get nth highest value, ensure it's not -inf (for example if top_n > top_k)
-        nth_highest = torch.topk(flat_scores, top_n)[0][-1]
-        if nth_highest == -float('inf'):
-            nth_highest = torch.finfo(flat_scores.dtype).min
+        nth_highest = flat_scores.topk(top_n).values[-1]
+        torch.nan_to_num_(nth_highest, neginf=torch.finfo(torch.float).min)
        # Get indices (token ids) of all scores >= nth highest value,
        # cap length at 4 * top_n as a precaution
        top_n_indices = (flat_scores >= nth_highest).nonzero().squeeze(-1)[:(top_n * 4)]
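
The replacement collapses the explicit `-inf` check into an in-place `torch.nan_to_num_`, whose `neginf=` argument rewrites only negative-infinity entries. This matters because once top-k filtering has already run, the nth-highest score can legitimately be `-inf`, and `flat_scores >= -inf` would then select the entire vocabulary. A small demonstration:

```python
import torch

flat_scores = torch.tensor([2.0, 1.0, -float("inf"), -float("inf")])

# With top-k filtering already applied, the 3rd-highest score is -inf
nth_highest = flat_scores.topk(3).values[-1]
assert nth_highest == -float("inf")

# In-place fix: only -inf entries are rewritten; finite values untouched
torch.nan_to_num_(nth_highest, neginf=torch.finfo(torch.float).min)
assert nth_highest == torch.finfo(torch.float).min
```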

@@ -407,7 +410,7 @@ def get_token_info(
     # Token ranks if requested
     if request.details.ranks:
         #TODO if we're also returning top_n perhaps search those first
-        token_info.rank = (scores > scores[0][next_token]).sum() + 1
+        token_info.rank = (scores > scores[0, next_token]).sum() + 1
 
     return token_info
 
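
`scores[0, next_token]` indexes the 2-D tensor in one step instead of going through the intermediate row view that `scores[0][next_token]` creates; the result is identical. The rank itself is one plus the number of strictly higher scores, so tied tokens share the better rank. A toy check with invented values:

```python
import torch

scores = torch.tensor([[1.5, 3.0, 2.0, 3.0]])  # shape [1, vocab_size]
next_token = 2                                  # chosen token scores 2.0

# 1-based rank: both 3.0 entries beat it, so it ranks third
rank = (scores > scores[0, next_token]).sum() + 1
assert rank.item() == 3
```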


@@ -447,7 +450,7 @@ def get_input_tokens_info(request, input_token_ids, all_input_logits) -> InputTo
     # Ensure top_n doesn't exceed vocab size
     top_n = min(top_n, all_input_logits.size(-1))
     # Get the nth highest value for each input token's set of logits
-    nth_highest_values = torch.topk(all_input_logits, top_n)[0][..., -1, None]
+    nth_highest_values = torch.topk(all_input_logits, top_n).values[..., -1, None]
     # Construct bool tensor marking all scores >= nth highest value for each token
     diff = (all_input_logits >= nth_highest_values)
     # Gather set of marked indices for each token (correspond to top token ids)
