
Commit 7d6ebcf

Fix for truncation option for models whose tokenizer adds bos_token

Notably includes Llama models.

Signed-off-by: Nick Hill <[email protected]>

1 parent 6488bb4 commit 7d6ebcf

File tree: 5 files changed, +43 -15 lines changed

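For context on the diffs below: tokenizers that set add_bos_token (notably Llama's) prepend bos_token_id to every encoding, so keeping only the last `truncate` tokens silently drops the BOS. A minimal sketch of the failure and of the fix, with made-up token ids and truncate value (not taken from the repository):

# Hypothetical ids; a real Llama tokenizer would produce its own.
bos_token_id = 1
tokenized_input = [1, 450, 4996, 17354, 1701, 29916]   # BOS followed by five prompt tokens

truncate = 3
tokenized_input = tokenized_input[-truncate:]           # [17354, 1701, 29916] -- BOS is gone

# The fix: write BOS back into the first retained slot so the model still
# sees a sequence that starts the way it did during training.
tokenized_input[0] = bos_token_id                       # [1, 1701, 29916]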

server/text_generation_server/models/causal_lm.py

Lines changed: 15 additions & 7 deletions

@@ -157,13 +157,21 @@ def from_pb(
 
         # Mask out truncated tokens
         # (input_texts aren't truncated, only input_lengths are)
-        for i in truncate_indices:
-            input_length = input_lengths[i]
-            attention_mask[i, :-input_length-padding_right_offset] = 0
-            if inputs_embeds is not None:
-                inputs_embeds[i, :-input_length, :] = 0
-            else:
-                input_ids[i, :-input_length] = tokenizer.pad_token_id
+        if truncate_indices:
+            add_bos_token = getattr(tokenizer, "add_bos_token", False)
+            for i in truncate_indices:
+                input_length = input_lengths[i]
+                attention_mask[i, :-input_length-padding_right_offset] = 0
+                if inputs_embeds is not None:
+                    inputs_embeds[i, :-input_length, :] = 0
+                    if add_bos_token:
+                        p = prefix_ids.get(i)
+                        orig_length = input_length if p is None else input_length - p.shape[0]
+                        inputs_embeds[i, -orig_length] = prefix_cache.bos_embedding
+                else:
+                    input_ids[i, :-input_length] = tokenizer.pad_token_id
+                    if add_bos_token:
+                        input_ids[i, -input_length] = tokenizer.bos_token_id
 
         if use_position_ids:
             # Fix up position ids
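A rough illustration of the index arithmetic in this hunk, assuming a left-padded batch with made-up ids (padding_right_offset is set to 0 to keep the example short):

import torch

pad_token_id, bos_token_id = 0, 1                      # assumed ids, not from the repo
input_ids = torch.tensor([[0, 0, 7, 8, 9, 4, 5, 6]])   # one left-padded request
input_length = 3                                        # request truncated to its last 3 tokens
padding_right_offset = 0

attention_mask = torch.ones_like(input_ids)
attention_mask[0, :-input_length - padding_right_offset] = 0   # mask everything left of the kept tokens
input_ids[0, :-input_length] = pad_token_id                    # blank out the truncated prefix
input_ids[0, -input_length] = bos_token_id                     # restore BOS at the first kept position

print(input_ids)   # tensor([[0, 0, 0, 0, 0, 1, 5, 6]])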

server/text_generation_server/models/flash_causal_lm.py

Lines changed: 4 additions & 0 deletions

@@ -107,6 +107,10 @@ def from_pb(
 
             tokenized_input = tokenized_input[-input_length:]
 
+            # Fill in bos token in truncation case if needed
+            if r.truncate and getattr(tokenizer, "add_bos_token", False):
+                tokenized_input[0] = tokenizer.bos_token_id
+
             input_lengths.append(input_length)
 
             tokenized_input = torch.tensor(tokenized_input, device=device)
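The same guard, sketched as stand-alone code with a Hugging Face tokenizer; the model name, prompt, and truncate value are illustrative only:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")   # any tokenizer with add_bos_token=True behaves the same way

truncate = 16
tokenized_input = tokenizer("a prompt long enough to need truncating ...")["input_ids"]

input_length = min(len(tokenized_input), truncate)
tokenized_input = tokenized_input[-input_length:]   # keep only the last `truncate` tokens

# Fill in bos token in truncation case if needed
if truncate and getattr(tokenizer, "add_bos_token", False):
    tokenized_input[0] = tokenizer.bos_token_id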

server/text_generation_server/models/model.py

Lines changed: 5 additions & 1 deletion

@@ -48,9 +48,10 @@ def __init__(self, engine: BaseInferenceEngine, dtype: torch.dtype):
 
         if prompt_prefix_supported:
             # Set up prefix cache
+            bos_token_id = getattr(self.tokenizer, "bos_token_id", None)
             decoder_start_token_id = self.model.config.decoder_start_token_id
             if decoder_start_token_id is None:
-                decoder_start_token_id = self.tokenizer.bos_token_id
+                decoder_start_token_id = bos_token_id
             self.prefix_cache = PrefixCache(
                 device=self.device,
                 dtype=dtype,
@@ -59,6 +60,9 @@ def __init__(self, engine: BaseInferenceEngine, dtype: torch.dtype):
                 decoder_start_tok_embedding=self.word_embeddings(
                     torch.tensor([decoder_start_token_id], device=self.device, dtype=torch.long)
                 ) if decoder_start_token_id is not None else None,
+                bos_embedding=self.word_embeddings(
+                    torch.tensor([bos_token_id], device=self.device, dtype=torch.long)
+                ) if bos_token_id is not None else None,
             )
         else:
             self.prefix_cache = None
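Roughly what the new bos_embedding argument holds, with a toy embedding table standing in for self.word_embeddings (vocabulary and hidden sizes are assumed):

import torch

vocab_size, hidden_size = 32, 8
word_embeddings = torch.nn.Embedding(vocab_size, hidden_size)

bos_token_id = 1
bos_embedding = word_embeddings(torch.tensor([bos_token_id], dtype=torch.long))
print(bos_embedding.shape)   # torch.Size([1, 8]); a single row, later written into inputs_embeds[i, -orig_length]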

server/text_generation_server/models/seq2seq_lm.py

Lines changed: 14 additions & 6 deletions

@@ -165,12 +165,20 @@ def from_pb(
 
         # Mask out truncated tokens
         # (input_texts aren't truncated, only input_lengths are)
-        for i in truncate_indices:
-            input_length = input_lengths[i]
-            attention_mask[i, :-input_length] = 0
-            input_ids[i, :-input_length] = tokenizer.pad_token_id
-            if inputs_embeds is not None:
-                inputs_embeds[i, :-input_length, :] = 0
+        if truncate_indices:
+            for i in truncate_indices:
+                add_bos_token = getattr(tokenizer, "add_bos_token", False)
+                input_length = input_lengths[i]
+                attention_mask[i, :-input_length] = 0
+                input_ids[i, :-input_length] = tokenizer.pad_token_id
+                if add_bos_token:
+                    input_ids[i, -input_length] = tokenizer.bos_token_id
+                if inputs_embeds is not None:
+                    inputs_embeds[i, :-input_length, :] = 0
+                    if add_bos_token:
+                        p = encoder_prefix_ids.get(i)
+                        orig_length = input_length if p is None else input_length - p.shape[0]
+                        inputs_embeds[i, -orig_length] = prefix_cache.bos_embedding
 
         if decoder_prefix_ids:
             # Construct decoder embeddings and attention mask
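A hedged note on the orig_length arithmetic above, with made-up lengths: when a cached prompt prefix was prepended for request i, input_length counts the prefix tokens too, so -orig_length points at the first of the user's own tokens rather than the start of the prefix.

input_length = 10   # kept tokens for request i, including an injected prompt prefix
prefix_len = 4      # p.shape[0] when encoder_prefix_ids.get(i) is not None

orig_length = input_length - prefix_len   # 6 tokens came from the user's (truncated) text
# inputs_embeds[i, -orig_length] is the first user token after the prefix,
# which is where prefix_cache.bos_embedding is written.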

server/text_generation_server/prompt_cache.py

Lines changed: 5 additions & 1 deletion

@@ -139,6 +139,7 @@ def move_node_to_head(self, cache_node: PromptCacheNode):
         cache_node.prev = None
         self.head = cache_node
 
+
 class PrefixCache:
     """Holds the cache of injectable prompts for a single model.
     """
@@ -149,15 +150,18 @@ def __init__(
         max_length: int,
         encoder_decoder: bool,
         decoder_start_tok_embedding: torch.Tensor,
+        bos_embedding: torch.Tensor,
     ):
         self.max_length = max_length
         self.embed_size = decoder_start_tok_embedding.shape[1] \
-            if decoder_start_tok_embedding is not None else None
+            if decoder_start_tok_embedding is not None else \
+            (bos_embedding.shape[1] if bos_embedding is not None else None)
         self.device: torch.device = device
         self.dtype = dtype
 
         self.is_encoder_decoder = encoder_decoder
         self.decoder_start_tok_embedding = decoder_start_tok_embedding
+        self.bos_embedding = bos_embedding
 
         self.cache_map: Dict[str, PromptCacheNode] = {}
         self.cache_dll: DoublyLinkedList = DoublyLinkedList()
