
Commit 68b61db

tjohnson31415 authored and njhill committed
feat: implement support for prompt prefixes for flash_causal_lm and santacoder
- add support for prompt embedding injection in flash_causal_lm.py
- add inputs_embeds support to the Flash Santacoder custom modeling code
- modify the discovery of the embedding layer from the model to make it work for GPTBigCode models

Signed-off-by: Travis Johnson <[email protected]>
1 parent 45bfd01 commit 68b61db
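
For context: a prompt prefix here is a cached tensor of soft-prompt embeddings with shape [prefix_length, hidden_size] that is spliced in front of the embedded prompt tokens before the first transformer layer. A minimal sketch of the idea, assuming illustrative names (wte, prefix_embeds) rather than the exact code in this commit:

import torch

hidden_size = 8
wte = torch.nn.Embedding(100, hidden_size)        # token embedding table
prefix_embeds = torch.randn(4, hidden_size)       # cached soft prompt, [prefix_length, hidden_size]

input_ids = torch.tensor([11, 12, 13])            # the user's tokenized prompt
token_embeds = wte(input_ids)                     # [3, hidden_size]

# With a prefix, the model is fed inputs_embeds instead of input_ids
inputs_embeds = torch.cat([prefix_embeds, token_embeds], dim=0)   # [7, hidden_size]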

File tree

3 files changed (+89, -45 lines):
- server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py
- server/text_generation_server/models/flash_causal_lm.py
- server/text_generation_server/models/model.py

server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py

Lines changed: 13 additions & 1 deletion
@@ -327,10 +327,20 @@ def forward(
         cu_seqlens,
         cu_seqlens_q,
         max_s,
+        inputs_embeds: Optional[torch.Tensor] = None,
         past_key_values: Optional[torch.Tensor] = None,
         pre_allocate_past_size: Optional[int] = None,
     ):
-        hidden_states = self.wte(input_ids) + self.wpe(position_ids)
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError(
+                "You cannot specify both input_ids and inputs_embeds at the same time"
+            )
+
+        if inputs_embeds is not None:
+            hidden_states = inputs_embeds + self.wpe(position_ids)
+            # TODO: support TP for the position embeddings
+        else:
+            hidden_states = self.wte(input_ids) + self.wpe(position_ids)

         if self.process_group.size() > 1:
             torch.distributed.all_reduce(hidden_states, group=self.process_group)
@@ -396,6 +406,7 @@ def forward(
         cu_seqlens,
         cu_seqlens_q,
         max_s,
+        inputs_embeds: Optional[torch.Tensor] = None,
         past_key_values: Optional[torch.Tensor] = None,
         pre_allocate_past_size: Optional[int] = None,
         lm_head_indices: Optional[torch.Tensor] = None,
@@ -406,6 +417,7 @@ def forward(
             cu_seqlens,
             cu_seqlens_q,
             max_s,
+            inputs_embeds,
             past_key_values,
             pre_allocate_past_size,
         )
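
A hedged sketch of how a caller might exercise the new argument. The parameters before cu_seqlens (input_ids, position_ids) are assumed from the removed wte/wpe line, and `model` stands in for a FlashSantacoder instance; this is not code from the commit:

# Token-id path (unchanged): embeddings come from the model's own wte lookup
out = model(
    input_ids, position_ids, cu_seqlens, cu_seqlens_q, max_s,
)

# Prefix path: pre-computed embeddings are passed instead, input_ids is None
out = model(
    None, position_ids, cu_seqlens, cu_seqlens_q, max_s,
    inputs_embeds=inputs_embeds,
)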

server/text_generation_server/models/flash_causal_lm.py

Lines changed: 71 additions & 44 deletions
@@ -31,8 +31,12 @@ class FlashCausalLMBatch(Batch):
     requests: List[generate_pb2.Request]

     # Decoder values
+    # tensors have sequences from the batch concatenated
+    # shape is [sum(seq_lengths)]
     input_ids: torch.Tensor
     position_ids: torch.Tensor
+    # shape is [sum(seq_lengths), embedding_size]
+    inputs_embeds: torch.Tensor
     # cumulative sequence lengths
     cu_seqlens: torch.Tensor
     # cumulative query sequence lengths, only used in decode
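
To make the packed layout in these comments concrete: sequences are concatenated with no padding between them, and cu_seqlens records the boundaries. A small illustrative example with made-up values:

# Two requests with 3 and 5 tokens respectively
input_ids    = torch.tensor([5, 7, 9,   5, 4, 8, 15, 16])     # shape [sum(seq_lengths)] == [8]
position_ids = torch.tensor([0, 1, 2,   0, 1, 2, 3, 4])
cu_seqlens   = torch.tensor([0, 3, 8], dtype=torch.int32)     # cumulative sequence lengths

second_request_ids = input_ids[cu_seqlens[1]:cu_seqlens[2]]   # -> [5, 4, 8, 15, 16]
# inputs_embeds, when present, has shape [sum(seq_lengths), embedding_size]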
@@ -68,77 +72,97 @@ def from_pb(
     ) -> Tuple[Optional["FlashCausalLMBatch"], List[GenerateError]]:
         errors = []
         batch_inputs = []
+        requests = pb.requests
+
+        # track indices of valid requests that have prefixes
+        i = 0
+        prefix_ids = {}
+        # compute sequence lengths in this loop too
+        # if there is a prefix, input_lengths will include its length
+        input_lengths = []
         max_seqlen = 0
-        for r in pb.requests:
+        # Cumulative length
+        cu_seqlens = [0]
+        cumulative_length = 0
+        for r in requests:
+            input_length = r.input_length
+            # TODO: Also fail depending on the model type for ones that don't
+            # have input_embeds implemented?
             if r.prefix_id:
-                message = f"Prompt prefixes not yet supported with flash attention (request #{r.id})"
-                logging.error(message)
-                # Exclude this request from the batch, return an error
-                errors.append(GenerateError(request_id=r.id, message=message))
-                continue
+                try:
+                    prefix_embeds = prefix_cache.get(r.prefix_id)
+                except Exception:
+                    message = f"Prefix lookup error for request #{r.id}, prefix id {r.prefix_id}"
+                    logging.error(message)
+                    # Exclude this request from the batch, return an error
+                    errors.append(GenerateError(request_id=r.id, message=message))
+                    continue
+                prefix_ids[i] = prefix_embeds
+                input_length += prefix_embeds.shape[0]
             batch_inputs.append(r.inputs)
-            max_seqlen = max(max_seqlen, r.input_length)
+            input_lengths.append(input_length)
+            max_seqlen = max(max_seqlen, input_length)
+            cumulative_length += input_length
+            cu_seqlens.append(cumulative_length)
+            i += 1

+        # remove errored requests
         if errors:
             requests = [r for r in pb.requests if not any(r.id == er.request_id for er in errors)]
+            # early exit if no requests are valid
             if not requests:
                 return None, errors

+        # return as lists to avoid unnecessary padding;
+        # sequences will be concatenated across the batch
         batch_tokenized_inputs = tokenizer(
             batch_inputs, truncation=True, max_length=max_seqlen, return_token_type_ids=False
         )["input_ids"]

+        # Process inputs to generate the needed tensors
         input_ids = []
         position_ids = []
-        cu_seqlens = [0]
-
-        input_lengths = []
         all_input_ids_tensor = []
-
         next_token_choosers = []
-
-        # Cumulative length
-        cumulative_length = 0
-
-        # Parse batch
-        requests = pb.requests
-        for r, tokenized_input in zip(requests, batch_tokenized_inputs):
-            input_length = r.input_length
-
-            tokenized_input = tokenized_input[-input_length:]
-
-            # Fill in bos token in truncation case if needed
-            if r.truncate and getattr(tokenizer, "add_bos_token", False):
-                tokenized_input[0] = tokenizer.bos_token_id
-
-            input_lengths.append(input_length)
-
+        for r, tokenized_input, input_length in zip(requests, batch_tokenized_inputs, input_lengths):
+            if r.truncate:
+                tokenized_input = tokenized_input[-r.input_length:]
+                # Fill in bos token in truncation case if needed
+                if getattr(tokenizer, "add_bos_token", False):
+                    tokenized_input[0] = tokenizer.bos_token_id
             tokenized_input = torch.tensor(tokenized_input, device=device)
-            input_ids.append(tokenized_input)
-
-            # Position ids
-            position_ids.append(torch.arange(0, input_length, dtype=torch.int32))
-
-            # Add cumulative lengths of all previous inputs
-            cu_seqlens.append(cumulative_length + input_length)
-
+            # LHS pad for prefix, if it exists; RHS pad to max output
+            padded_input_ids = F.pad(tokenized_input, (input_length - r.input_length, r.max_output_length))
+            all_input_ids_tensor.append(padded_input_ids)
+            # input_ids needs prefix padding but not output padding
+            input_ids.append(tokenized_input if input_length == r.input_length else padded_input_ids[:input_length])
             next_token_choosers.append(
                 NextTokenChooser.from_pb(r.parameters, r.details.logprobs, tokenizer, device)
             )
-            all_input_ids_tensor.append(F.pad(tokenized_input, (0, r.max_output_length)))
-
-            cumulative_length += input_length
-
+            position_ids.append(torch.arange(0, input_length, dtype=torch.int32))
         input_ids = torch.cat(input_ids)
-        position_ids = torch.cat(position_ids).to(device, non_blocking=True)
-        cu_seqlens = torch.tensor(cu_seqlens, dtype=torch.int32, device=device)
+
+        # convert all requests to embeddings if any request has a prefix_id
+        if prefix_ids:
+            # TODO: Handle TP distributed embeddings layer
+            inputs_embeds = embeddings_lookup(input_ids)
+            input_ids = None
+            # fill in the prefix embeddings into the space that we already
+            # allocated due to the padding in input_ids
+            for i, p in prefix_ids.items():
+                start = cu_seqlens[i]
+                prefix_length = p.shape[0]
+                inputs_embeds[start:start+prefix_length, :] = p
+        else:
+            inputs_embeds = None

         return cls(
             batch_id=pb.id,
             requests=requests,
             input_ids=input_ids,
-            position_ids=position_ids,
-            cu_seqlens=cu_seqlens,
+            inputs_embeds=inputs_embeds,
+            position_ids=torch.cat(position_ids).to(device, non_blocking=True),
+            cu_seqlens=torch.tensor(cu_seqlens, dtype=torch.int32, device=device),
             cu_seqlens_q=None,
             max_seqlen=max_seqlen,
             past_key_values=None,
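
A worked example of the padding arithmetic in this hunk, assuming one request with 3 prompt tokens, a cached prefix of length 2, and max_output_length of 4 (the zeros on the left are placeholders that the prefix embeddings overwrite after the embedding lookup):

import torch
import torch.nn.functional as F

tokenized_input = torch.tensor([7, 9, 11])             # r.input_length == 3
prefix_length = 2                                      # prefix_embeds.shape[0]
input_length = 3 + prefix_length                       # length used for cu_seqlens / position_ids

# LHS pad reserves room for the prefix, RHS pad reserves room for generated tokens
padded_input_ids = F.pad(tokenized_input, (prefix_length, 4))   # -> [0, 0, 7, 9, 11, 0, 0, 0, 0]
input_ids_entry = padded_input_ids[:input_length]               # -> [0, 0, 7, 9, 11]

# After the embedding lookup, rows cu_seqlens[i] : cu_seqlens[i] + prefix_length of
# inputs_embeds are overwritten with the cached prefix embeddings.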
@@ -195,6 +219,7 @@ def concatenate(cls, batches: List["FlashCausalLMBatch"]) -> "FlashCausalLMBatch":
             batch_id=batches[0].batch_id,
             requests=requests,
             input_ids=torch.cat(input_ids),
+            inputs_embeds=None,
             position_ids=torch.cat(position_ids),
             cu_seqlens=torch.cat(cu_seqlens),
             cu_seqlens_q=torch.arange(len(requests) + 1, device=device, dtype=torch.int32),
@@ -345,6 +370,7 @@ def generate_token(
             batch.cu_seqlens,
             batch.cu_seqlens_q,
             batch.max_seqlen,
+            batch.inputs_embeds,
             past_key_values,
             prealloc_length,
         )
@@ -410,6 +436,7 @@ def _process_prefill(
         # Create final next batch tensors
         batch.input_ids = torch.cat(next_batch_input_ids) \
             if batch_size > 1 else next_batch_input_ids[0].view(1)
+        batch.inputs_embeds = None

         batch.cu_seqlens_q = torch.arange(
             batch_size + 1, device=self.device, dtype=torch.int32

server/text_generation_server/models/model.py

Lines changed: 5 additions & 0 deletions
@@ -152,6 +152,11 @@ def get_indices_to_keep(
         return next_batch_keep_indices

     def _setup_prompt_encoder(self) -> bool:
+        # this is the most common name for the word embedding module for transformers models
+        if hasattr(self.model, 'transformer') and hasattr(self.model.transformer, 'wte'):
+            self.word_embeddings = self.model.transformer.wte
+            return True
+
         vocab_size = getattr(self.model.config, "vocab_size", None)

         if vocab_size is not None and hasattr(self.model, "named_children"):
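
The short-circuit above handles models like GPTBigCode (santacoder) that expose the token embedding as model.transformer.wte, while the existing code below it falls back to scanning submodules against the vocab size. A hedged sketch of that two-step discovery pattern, not the exact fallback implementation:

import torch

def find_word_embeddings(model):
    # Preferred: the common transformers layout, e.g. GPT-2 / GPTBigCode
    if hasattr(model, "transformer") and hasattr(model.transformer, "wte"):
        return model.transformer.wte

    # Fallback: any nn.Embedding whose first dimension matches the vocab size
    vocab_size = getattr(model.config, "vocab_size", None)
    if vocab_size is None:
        return None
    for module in model.modules():
        if isinstance(module, torch.nn.Embedding) and module.num_embeddings == vocab_size:
            return module
    return None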
