
Commit 2f79208

maxdebayser authored and njhill committed
Fix the support for input_embeds in santacoder in sharded mode
* Remove default argument
* Activate multi-shard input_embeds test cases
* Change treatment of input_embeds in flash_santacoder: instead of dividing the prefix embeddings by the world size in all shards, only the rank 0 shard returns a non-zero tensor, preserving the semantics of the all_reduce operation without the potential loss of precision that a floating-point division entails.

1 parent 49a0b2c commit 2f79208
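
To make the reasoning in the commit message concrete, here is a minimal, non-distributed sketch: the all_reduce across shards is simulated as a plain sum over per-shard tensors, and the names (world_size, prefix) are illustrative rather than taken from the repository.

import torch

torch.manual_seed(0)
world_size = 4
# Stand-in for a tuned prompt prefix embedding table (float16, like the model weights).
prefix = torch.randn(8, 128, dtype=torch.float16)

# Previous approach: every shard contributes prefix / world_size and the
# all_reduce (here: a plain sum) adds the pieces back together. The float16
# division and summation can introduce rounding error.
reduced_divided = sum(prefix / world_size for _ in range(world_size))

# New approach: only rank 0 contributes the prefix, the other ranks contribute
# zeros, so the reduction reproduces the prefix exactly.
reduced_zeroed = sum(prefix if rank == 0 else torch.zeros_like(prefix)
                     for rank in range(world_size))

print(torch.equal(reduced_zeroed, prefix))            # True: adding zeros is exact
print((reduced_divided - prefix).abs().max().item())  # typically a small, non-zero error

When the world size is not a power of two, the division itself already rounds, which is exactly the precision loss the zero-contribution scheme avoids.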

File tree

8 files changed: +31, -32 lines


integration_tests/test_cases_bloom560m.yaml

Lines changed: 0 additions & 10 deletions
@@ -42,8 +42,6 @@
 
   # Prompt prefix
   - name: Greedy with tuned prompt prefix
-    # Prompt prefixes with multi-shard not yet supported
-    singleShardOnly: true
     request:
       prefixId: bloom_sentiment_1
       params:
@@ -59,8 +57,6 @@
       text: ' positive'
 
   - name: Greedy with tuned prompt prefix and truncation
-    # Prompt prefixes with multi-shard not yet supported
-    singleShardOnly: true
     request:
       prefixId: bloom_sentiment_1
       params:
@@ -79,8 +75,6 @@
 
   # Prompt prefix with nested path
   - name: Greedy with tuned prompt prefix with nested path (id)
-    # Prompt prefixes with multi-shard not yet supported
-    singleShardOnly: true
     request:
       prefixId: nested/path
       params:
@@ -98,8 +92,6 @@
 
   # Prompt prefix returning input and generated tokens
   - name: Greedy with tuned prompt prefix and returned tokens
-    # Prompt prefixes with multi-shard not yet supported
-    singleShardOnly: true
     request:
       prefixId: bloom_sentiment_1
       params:
@@ -279,8 +271,6 @@
 
   # Error case - invalid prefix id
   - name: Error case - invalid prefix id
-    # Prompt prefixes with multi-shard not yet supported
-    singleShardOnly: true
     request:
       prefixId: invalid_prefix_id
       params:

integration_tests/test_cases_tinyllama.yaml

Lines changed: 0 additions & 6 deletions
@@ -160,8 +160,6 @@
 
   # Prompt prefix
   - name: Greedy with tuned prompt prefix
-    # Prompt prefixes with multi-shard not yet supported
-    singleShardOnly: true
     request:
       prefixId: tinyllama
       params:
@@ -179,8 +177,6 @@
 
   # Prompt prefix with truncation
   - name: Greedy with tuned prompt prefix with truncation
-    # Prompt prefixes with multi-shard not yet supported
-    singleShardOnly: true
     request:
       prefixId: tinyllama
       params:
@@ -201,8 +197,6 @@
 
   # Prompt prefix returning input and generated tokens
   - name: Greedy with tuned prompt prefix and returned tokens
-    # Prompt prefixes with multi-shard not yet supported
-    singleShardOnly: true
     request:
       prefixId: tinyllama
       params:

integration_tests/test_cases_tinystarcoderpy.yaml

Lines changed: 0 additions & 6 deletions
@@ -172,8 +172,6 @@
 
   # Prompt prefix
   - name: Greedy with tuned prompt prefix
-    # Prompt prefixes with multi-shard not yet supported
-    singleShardOnly: true
     request:
       # Prefix is "def hello_world():\n"
       prefixId: tiny_starcoder
@@ -189,8 +187,6 @@
       text: "(\"Hello World!\")\n\nhello_world()\n"
 
   - name: Greedy with tuned prompt prefix and truncation
-    # Prompt prefixes with multi-shard not yet supported
-    singleShardOnly: true
     request:
       # Prefix is "def hello_world():\n"
       prefixId: tiny_starcoder
@@ -209,8 +205,6 @@
 
   # Prompt prefix returning input and generated tokens
   - name: Greedy with tuned prompt prefix and returned tokens
-    # Prompt prefixes with multi-shard not yet supported
-    singleShardOnly: true
     request:
       # Prefix is "def hello_world():\n"
       prefixId: tiny_starcoder

integration_tests/text_generation_tests/test_server.py

Lines changed: 2 additions & 2 deletions
@@ -396,7 +396,7 @@ async def test_mt0(server_fixture, test_cases):
 # test with tiny GPTBigCode model for the merged kv cache
 @pytest.mark.model("bigcode/tiny_starcoder_py")
 @pytest.mark.extensions(".safetensors,.json")
-@pytest.mark.shards(1)
+@pytest.mark.shards(2)
 @pytest.mark.test_case_file("test_cases_tinystarcoderpy.yaml")
 @pytest.mark.asyncio
 async def test_gptbigcode(server_fixture, test_cases):
@@ -405,7 +405,7 @@ async def test_gptbigcode(server_fixture, test_cases):
 # test with Llama model which has tokenizer.add_bos_token == true
 @pytest.mark.model("Maykeye/TinyLLama-v0")
 @pytest.mark.extensions(".bin,.json,.model")
-@pytest.mark.shards(1)
+@pytest.mark.shards(2)
 @pytest.mark.test_case_file("test_cases_tinyllama.yaml")
 @pytest.mark.asyncio
 async def test_llama(server_fixture, test_cases):

server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py

Lines changed: 1 addition & 2 deletions
@@ -409,10 +409,9 @@ def forward(
             raise ValueError(
                 "You cannot specify both input_ids and inputs_embeds at the same time"
             )
-
+
         if inputs_embeds is not None:
             hidden_states = inputs_embeds + self.wpe(position_ids)
-            # TODO: support TP for the position embeddings
         else:
             hidden_states = self.wte(input_ids) + self.wpe(position_ids)
 

server/text_generation_server/models/flash_causal_lm.py

Lines changed: 0 additions & 1 deletion
@@ -156,7 +156,6 @@ def from_pb(
 
         # convert all requests to embeddings if any request has a prefix_id
         if prefix_ids:
-            # TODO: Handle TP distributed embeddings layer
            inputs_embeds = embeddings_lookup(input_ids)
            input_ids = None
            # fill in the prefix embeddings into the space that we already
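
For orientation, a hypothetical sketch of the step described by the comment at the end of the hunk above (the comment continues beyond the visible context): writing cached prefix embeddings into space already reserved at the front of a request's embedding slice. The helper name fill_prefix, the tensor layout and the start offset are assumptions for illustration, not the repository's actual code.

import torch

def fill_prefix(inputs_embeds: torch.Tensor, prefix: torch.Tensor, start: int) -> None:
    """Hypothetical helper: write cached prefix embeddings into space that was
    already reserved at the front of one request's embedding slice.

    inputs_embeds: (total_len, hidden) embeddings for one request, with the
                   first prefix.shape[0] rows reserved for the prefix
    prefix:        (prefix_len, hidden) cached prefix; on non-rank-0 shards this
                   is an all-zero tensor when return_zero is active
    start:         offset of the request inside a flattened batch
    """
    prefix_len = prefix.shape[0]
    inputs_embeds[start:start + prefix_len] = prefix

On non-rank-0 shards the prefix tensor handed in here is the all-zero view produced by PrefixCache, so the later all_reduce adds the real prefix exactly once.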

server/text_generation_server/models/model.py

Lines changed: 10 additions & 0 deletions
@@ -53,11 +53,21 @@ def __init__(self, engine: BaseInferenceEngine, dtype: torch.dtype):
         decoder_start_token_id = self.model.config.decoder_start_token_id
         if decoder_start_token_id is None:
             decoder_start_token_id = self.tokenizer.bos_token_id
+
+        return_zero = False
+        # If the word_embeddings layer is configured not to reduce at the end of the forward() call
+        # each shard will have only a partial tensor. This tensor cannot be concatenated with a
+        # prefix tensor in each shard because the reduce that happens afterwards would result
+        # in adding the prefix N times, where N is the world size.
+        if isinstance(self.word_embeddings, TensorParallelEmbedding) and not self.word_embeddings.reduce:
+            return_zero = self.word_embeddings.process_group.rank() != 0
+
         self.prefix_cache = PrefixCache(
             device=self.device,
             dtype=dtype,
             max_length=MAX_PROMPT_PREFIX_LENGTH,
             encoder_decoder=self.model.config.is_encoder_decoder,
+            return_zero=return_zero,
             decoder_start_tok_embedding=self.word_embeddings(
                 torch.tensor([decoder_start_token_id], device=self.device, dtype=torch.long)
             ) if decoder_start_token_id is not None else None,
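
The comment added in this hunk is the heart of the fix: with a sharded, non-reducing embedding layer, each rank produces only a partial embedding, and anything added identically on every rank ends up multiplied by the world size after the all_reduce. A rough sketch of that failure mode, using a toy vocab-sharded lookup in place of TensorParallelEmbedding and a plain sum in place of the all_reduce (everything below is illustrative):

import torch

world_size = 2
hidden = 4
vocab = 6
torch.manual_seed(0)
full_table = torch.randn(vocab, hidden)
# Toy vocab sharding: each rank owns a slice of the embedding table and
# returns zeros for ids it does not own (reduce=False behaviour).
shards = full_table.chunk(world_size, dim=0)

def partial_lookup(rank: int, ids: torch.Tensor) -> torch.Tensor:
    lo = rank * (vocab // world_size)
    hi = lo + vocab // world_size
    out = torch.zeros(len(ids), hidden)
    mask = (ids >= lo) & (ids < hi)
    out[mask] = shards[rank][ids[mask] - lo]
    return out

ids = torch.tensor([0, 3, 5])
prefix_row = torch.randn(1, hidden)

# Wrong: every rank prepends the prefix to its partial result; the "all_reduce"
# (sum over ranks) then contains the prefix world_size times.
wrong = sum(torch.cat((prefix_row, partial_lookup(r, ids))) for r in range(world_size))

# Fix from this commit: only rank 0 contributes the prefix row.
right = sum(torch.cat((prefix_row if r == 0 else torch.zeros_like(prefix_row),
                       partial_lookup(r, ids)))
            for r in range(world_size))

print(torch.allclose(wrong[0], world_size * prefix_row))  # True: prefix counted world_size times
print(torch.allclose(right[0], prefix_row))               # True: prefix counted once
print(torch.allclose(right[1:], full_table[ids]))         # token embeddings reduce correctly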

server/text_generation_server/prompt_cache.py

Lines changed: 18 additions & 5 deletions
@@ -4,7 +4,7 @@
 from pathlib import Path
 import re
 import threading
-from typing import Dict, List, Union, Tuple
+from typing import Dict, List, Union, Tuple, Optional
 
 import torch
 
@@ -149,6 +149,7 @@ def __init__(
         dtype: torch.dtype,
         max_length: int,
         encoder_decoder: bool,
+        return_zero: Optional[bool],
         decoder_start_tok_embedding: torch.Tensor,
     ):
         self.max_length = max_length
@@ -158,6 +159,7 @@ def __init__(
         self.dtype = dtype
 
         self.is_encoder_decoder = encoder_decoder
+        self.zero = torch.zeros((1,), dtype=dtype, device=device) if return_zero else None
         self.decoder_start_tok_embedding = decoder_start_tok_embedding
 
         self.cache_map: Dict[str, PromptCacheNode] = {}
@@ -210,23 +212,34 @@ def _load_embedding_tensors(self, prefix_id: str) -> Union[torch.Tensor, Tuple[t
         decoder_prefix = self._load_embedding_tensor(prefix_id, "decoder.pt", dtype=self.dtype)
         # For encoder-decoder we store a tuple of (encoder_prefix, decoder_prefix),
         # at least one must be non-None
+        if decoder_prefix is not None:
+            if self.zero is not None:
+                decoder_prefix = self.zero.expand(decoder_prefix.shape)
+            else:
+                decoder_prefix = decoder_prefix.to(self.dtype).to(self.device, non_blocking=True)
+
         if self.is_encoder_decoder:
             encoder_prefix = self._load_embedding_tensor(prefix_id, "encoder.pt", dtype=self.dtype)
             if decoder_prefix is None:
                 if encoder_prefix is None:
                     raise PrefixNotFound(f"Prefix id {prefix_id} not found")
             else:
-                decoder_prefix = decoder_prefix.to(self.device, non_blocking=True)
                 # TODO confirm this cat is correct
-                decoder_prefix = torch.cat((decoder_prefix, self.decoder_start_tok_embedding))
+                if self.zero is not None:
+                    decoder_prefix = self.zero.expand(decoder_prefix.shape[0] + 1, *decoder_prefix.shape[1:])
+                else:
+                    decoder_prefix = torch.cat((decoder_prefix, self.decoder_start_tok_embedding))
             if encoder_prefix is not None:
-                encoder_prefix = encoder_prefix.to(self.device, non_blocking=True)
+                if self.zero is not None:
+                    encoder_prefix = self.zero.expand(encoder_prefix.shape)
+                else:
+                    encoder_prefix = encoder_prefix.to(self.device, non_blocking=True)
             prefix = encoder_prefix, decoder_prefix
         # For decoder-only we store just the decoder prefix
         elif decoder_prefix is None:
             raise PrefixNotFound(f"Prefix id {prefix_id} not found")
         else:
-            prefix = decoder_prefix.to(self.dtype).to(self.device, non_blocking=True)
+            prefix = decoder_prefix
         return prefix
 
     def _load_embedding_tensor(self, prefix_id: str, filename: str, dtype: torch.dtype) -> torch.Tensor:
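
One note on the zero.expand(...) calls introduced above: expand broadcasts the single cached zero element into a view of the requested shape without allocating a full tensor, and the shape[0] + 1 form matches the extra row that rank 0 gains by concatenating decoder_start_tok_embedding. A small standalone sketch of the shape arithmetic (the sizes are made up for illustration):

import torch

device, dtype = "cpu", torch.float16
zero = torch.zeros((1,), dtype=dtype, device=device)

decoder_prefix = torch.randn(16, 64, dtype=dtype)              # (prefix_len, hidden)
decoder_start_tok_embedding = torch.randn(1, 64, dtype=dtype)  # (1, hidden)

# Rank 0 (zero is None in the real class): prefix plus the decoder start token.
with_start = torch.cat((decoder_prefix, decoder_start_tok_embedding))
print(with_start.shape)     # torch.Size([17, 64])

# Other ranks: an all-zero view with the same final shape, no large allocation.
zeros_view = zero.expand(decoder_prefix.shape[0] + 1, *decoder_prefix.shape[1:])
print(zeros_view.shape)     # torch.Size([17, 64])
print(zeros_view.stride())  # (0, 0) - a broadcast view over a single element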

0 commit comments
