Commit 1e8ed0c

Simplify handling of truncation with add_bos_token
In the non-flash causal_lm and seq2seq_lm cases, move the truncation / bos_token insertion logic before the embeddings lookup so that special handling of the BOS embedding isn't needed. Also update the changelog with recent updates.

Signed-off-by: Nick Hill <[email protected]>
1 parent 68b61db commit 1e8ed0c
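The heart of the change: truncation masking and BOS insertion now happen directly on the token ids, before any embedding lookup, so the looked-up embeddings are already correct and no cached BOS embedding is needed. A minimal sketch of that pattern with a toy single-request batch (tensor names mirror the diff; the values, pad/BOS ids and offsets are made up for illustration):

```python
import torch

pad_id, bos_id = 0, 1
padding_right_offset = 2   # attention-mask slots reserved for tokens generated later
orig_input_length = 3      # length the request was truncated to

# One request whose prompt tokenized to 5 tokens; the attention mask is
# allocated with extra room on the right for future generated tokens.
all_input_ids = torch.tensor([[11, 12, 13, 14, 15]])
attention_mask = torch.tensor([[1, 1, 1, 1, 1, 0, 0]])

i = 0  # index of the truncated request within the batch
# Keep attention only on the last `orig_input_length` prompt tokens.
attention_mask[i, :-orig_input_length - padding_right_offset] = 0
# Overwrite the masked-out ids with padding.
all_input_ids[i, :-orig_input_length] = pad_id
# If the tokenizer normally adds BOS, make the first kept token BOS.
all_input_ids[i, -orig_input_length] = bos_id

print(all_input_ids)   # tensor([[ 0,  0,  1, 14, 15]])
print(attention_mask)  # tensor([[0, 0, 1, 1, 1, 0, 0]])
```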

File tree

server/text_generation_server/models/causal_lm.py
server/text_generation_server/models/model.py
server/text_generation_server/models/seq2seq_lm.py
server/text_generation_server/prompt_cache.py

4 files changed (+26, -44 lines)


server/text_generation_server/models/causal_lm.py

Lines changed: 12 additions & 18 deletions
@@ -140,6 +140,18 @@ def from_pb(
         # Copy tokenizer attention_mask into fully allocated attention_mask
         attention_mask[:, :tokenize_length] = tokenized_inputs["attention_mask"]
 
+        # Mask out truncated tokens
+        # (input_texts aren't truncated, only input_lengths are)
+        if truncate_indices:
+            add_bos_token = getattr(tokenizer, "add_bos_token", False)
+            for i in truncate_indices:
+                orig_input_length = requests[i].input_length
+                attention_mask[i, :-orig_input_length-padding_right_offset] = 0
+                all_input_ids[i, :-orig_input_length] = tokenizer.pad_token_id
+                if add_bos_token:
+                    # Ensure that first non-virtual token is set to BOS
+                    all_input_ids[i, -orig_input_length] = tokenizer.bos_token_id
+
         if prefix_ids:
             # Get input embeddings
             inputs_embeds = embeddings_lookup(all_input_ids)
@@ -155,24 +167,6 @@ def from_pb(
             input_ids = all_input_ids
             inputs_embeds = None
 
-        # Mask out truncated tokens
-        # (input_texts aren't truncated, only input_lengths are)
-        if truncate_indices:
-            add_bos_token = getattr(tokenizer, "add_bos_token", False)
-            for i in truncate_indices:
-                input_length = input_lengths[i]
-                attention_mask[i, :-input_length-padding_right_offset] = 0
-                if inputs_embeds is not None:
-                    inputs_embeds[i, :-input_length, :] = 0
-                    if add_bos_token:
-                        p = prefix_ids.get(i)
-                        orig_length = input_length if p is None else input_length - p.shape[0]
-                        inputs_embeds[i, -orig_length] = prefix_cache.bos_embedding
-                else:
-                    input_ids[i, :-input_length] = tokenizer.pad_token_id
-                    if add_bos_token:
-                        input_ids[i, -input_length] = tokenizer.bos_token_id
-
         if use_position_ids:
             # Fix up position ids
             sliced_attention_mask = attention_mask[:, :-padding_right_offset]
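The surviving context at the end of this hunk slices off the right padding before "fixing up" position ids. The repository's exact fix-up isn't shown in the diff, but a common way to derive position ids from a left-padded (and now partially re-masked) attention mask is sketched below; treat the details as an assumption rather than this codebase's implementation:

```python
import torch

# Hypothetical illustration: positions count only attended tokens, so padded
# or truncated slots on the left don't shift the real token positions.
sliced_attention_mask = torch.tensor([[0, 0, 1, 1, 1],
                                      [1, 1, 1, 1, 1]])

position_ids = sliced_attention_mask.cumsum(dim=-1) - 1
position_ids.masked_fill_(sliced_attention_mask == 0, 0)

print(position_ids)
# tensor([[0, 0, 0, 1, 2],
#         [0, 1, 2, 3, 4]])
```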

server/text_generation_server/models/model.py

Lines changed: 1 addition & 5 deletions
@@ -48,10 +48,9 @@ def __init__(self, engine: BaseInferenceEngine, dtype: torch.dtype):
 
         if prompt_prefix_supported:
             # Set up prefix cache
-            bos_token_id = getattr(self.tokenizer, "bos_token_id", None)
             decoder_start_token_id = self.model.config.decoder_start_token_id
             if decoder_start_token_id is None:
-                decoder_start_token_id = bos_token_id
+                decoder_start_token_id = self.tokenizer.bos_token_id
             self.prefix_cache = PrefixCache(
                 device=self.device,
                 dtype=dtype,
@@ -60,9 +59,6 @@ def __init__(self, engine: BaseInferenceEngine, dtype: torch.dtype):
                 decoder_start_tok_embedding=self.word_embeddings(
                     torch.tensor([decoder_start_token_id], device=self.device, dtype=torch.long)
                 ) if decoder_start_token_id is not None else None,
-                bos_embedding=self.word_embeddings(
-                    torch.tensor([bos_token_id], device=self.device, dtype=torch.long)
-                ) if bos_token_id is not None else None,
             )
         else:
             self.prefix_cache = None

server/text_generation_server/models/seq2seq_lm.py

Lines changed: 12 additions & 17 deletions
@@ -150,6 +150,18 @@ def from_pb(
         input_ids = tokenized_inputs["input_ids"]
         attention_mask = tokenized_inputs["attention_mask"]
 
+        # Mask out truncated tokens
+        # (input_texts aren't truncated, only input_lengths are)
+        if truncate_indices:
+            add_bos_token = getattr(tokenizer, "add_bos_token", False)
+            for i in truncate_indices:
+                orig_input_length = requests[i].input_length
+                attention_mask[i, :-orig_input_length] = 0
+                input_ids[i, :-orig_input_length] = tokenizer.pad_token_id
+                if add_bos_token:
+                    # Ensure that first non-virtual token is set to BOS
+                    input_ids[i, -orig_input_length] = tokenizer.bos_token_id
+
         if encoder_prefix_ids:
             # Get input embeddings
             inputs_embeds = embeddings_lookup(input_ids)
@@ -163,23 +175,6 @@ def from_pb(
         else:
             inputs_embeds = None
 
-        # Mask out truncated tokens
-        # (input_texts aren't truncated, only input_lengths are)
-        if truncate_indices:
-            for i in truncate_indices:
-                add_bos_token = getattr(tokenizer, "add_bos_token", False)
-                input_length = input_lengths[i]
-                attention_mask[i, :-input_length] = 0
-                input_ids[i, :-input_length] = tokenizer.pad_token_id
-                if add_bos_token:
-                    input_ids[i, -input_length] = tokenizer.bos_token_id
-                if inputs_embeds is not None:
-                    inputs_embeds[i, :-input_length, :] = 0
-                    if add_bos_token:
-                        p = encoder_prefix_ids.get(i)
-                        orig_length = input_length if p is None else input_length - p.shape[0]
-                        inputs_embeds[i, -orig_length] = prefix_cache.bos_embedding
-
         if decoder_prefix_ids:
             # Construct decoder embeddings and attention mask
             start_tok_embedding = prefix_cache.decoder_start_tok_embedding
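The new comment "Ensure that first non-virtual token is set to BOS" refers to the prompt-prefix ("virtual token") embeddings that a cached prefix occupies in front of the real input embeddings. The deleted code located that first real token with `orig_length = input_length - p.shape[0]`; the index arithmetic, sketched with a toy left-padded row, looks like this (layout illustrative only, not the repository's data structures):

```python
# Toy left-padded row: 2 pad slots, 3 virtual prefix slots, 4 real tokens.
prefix_len = 3
orig_length = 4                          # real (non-virtual) tokens only
input_length = prefix_len + orig_length  # what the old code called input_length

row = ["pad", "pad", "v0", "v1", "v2", "BOS", "tok", "tok", "tok"]

# -input_length is the first virtual slot, -orig_length the first real token,
# which is exactly the slot the old code overwrote with the BOS embedding and
# the new code now fills with bos_token_id before the embedding lookup.
assert row[-input_length] == "v0"
assert row[-orig_length] == "BOS"
```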

server/text_generation_server/prompt_cache.py

Lines changed: 1 addition & 4 deletions
@@ -150,18 +150,15 @@ def __init__(
         max_length: int,
         encoder_decoder: bool,
         decoder_start_tok_embedding: torch.Tensor,
-        bos_embedding: torch.Tensor,
     ):
         self.max_length = max_length
         self.embed_size = decoder_start_tok_embedding.shape[1] \
-            if decoder_start_tok_embedding is not None else \
-            (bos_embedding.shape[1] if bos_embedding is not None else None)
+            if decoder_start_tok_embedding is not None else None
         self.device: torch.device = device
         self.dtype = dtype
 
         self.is_encoder_decoder = encoder_decoder
         self.decoder_start_tok_embedding = decoder_start_tok_embedding
-        self.bos_embedding = bos_embedding
 
         self.cache_map: Dict[str, PromptCacheNode] = {}
         self.cache_dll: DoublyLinkedList = DoublyLinkedList()
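With the ids fixed up before the lookup, the embedding matrix itself produces the BOS row, which is why PrefixCache no longer needs to carry a precomputed bos_embedding. A small self-contained check of that equivalence (toy embedding table and ids; not the repository's code):

```python
import torch

vocab_size, embed_size = 10, 4
pad_id, bos_id = 0, 1
word_embeddings = torch.nn.Embedding(vocab_size, embed_size)

input_ids = torch.tensor([[5, 6, 7, 8]])
orig_input_length = 2   # truncated length for this request

# New approach: write pad/BOS ids first, then do a single embedding lookup.
input_ids[0, :-orig_input_length] = pad_id
input_ids[0, -orig_input_length] = bos_id
inputs_embeds = word_embeddings(input_ids)

# Old approach (removed by this commit): patch a cached BOS embedding in
# after the lookup. Both paths yield the same row of the embedding matrix.
bos_embedding = word_embeddings(torch.tensor([bos_id]))
assert torch.equal(inputs_embeds[0, -orig_input_length], bos_embedding[0])
```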
