Commit a094488

jvlunteren authored and tdoublep committed
Reduce PT2C Warmup time
This PR targets an issue created by Tom Parnell with the following description:

"Currently in the PT2C warmup logic we essentially perform warmup twice, once with as_concat=False and again with as_concat=True. It was implemented this way because we see some differences between "normal" batches and batches that were created from concatenation. The warmup logic essentially tries to cover both of these cases. Specifically, the differences between normal batches and post-concat batches are as follows:

1. Post-concat batches always have contiguous PKV tensors, whereas "normal" batches have contiguous PKV tensors almost all of the time but very occasionally (e.g., after the very first token is generated) have non-contiguous PKV tensors.
2. Post-concat batches contain the decoder_attention_mask tensor (for encoder-decoder models), whereas for normal batches it is set to None.

The issue relates to the following work: can we make some small code changes to essentially regularize these two cases? Since the PKV tensors are only rarely non-contiguous, can't we just force them to be contiguous before calling forward? There is some latency penalty to doing this, but since most of the time it is not needed, we might be ok. Can we also define the decoder_attention_mask for "normal" batches? Again, perhaps there is some small latency overhead from this which needs to be evaluated. These changes may incur a potential latency cost but will have the benefit of halving the warmup time. The work here is to (a) implement these changes and (b) verify that the latency overhead is minimal."

The update involves the required small code changes as described above.

Co-authored-by: Thomas Parnell <[email protected]>
1 parent 92f1978 commit a094488
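
A minimal sketch of the two regularizations described above. The helper names (force_contiguous_pkv, make_decoder_attention_mask) are hypothetical and only illustrate the idea; the actual changes live in the diffs below.

import torch

def force_contiguous_pkv(pkv):
    # A C-contiguous tensor has strides in non-increasing order; if a
    # past-key-value tensor violates that, materialize a contiguous copy
    # so the forward pass always sees the same memory layout.
    for layer in pkv:
        for x in layer:
            strides = list(x.stride())
            if strides != sorted(strides, reverse=True):
                x.data = x.data.clone(memory_format=torch.contiguous_format)
    return pkv

def make_decoder_attention_mask(attention_mask, batch_size,
                                max_decoder_input_length, padding_right_offset):
    # Pre-allocate the decoder mask (instead of leaving it None) so "normal"
    # batches expose the same tensors as post-concat batches; only the BOS
    # position is marked valid for a fresh batch.
    mask = attention_mask.new_zeros(
        batch_size, max_decoder_input_length + padding_right_offset
    )
    mask[:, 0] = 1
    return mask

Both operations are cheap in the common case where the tensors are already contiguous, which is why the latency cost is expected to be small.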

File tree

5 files changed (+40, -44 lines)


server/text_generation_server/models/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -3,7 +3,7 @@
 
 import torch
 
-from text_generation_server.models.model import Model
+from text_generation_server.models.model import Model, PT2_COMPILE
 from transformers.models.auto import modeling_auto
 
 from text_generation_server.models.causal_lm import CausalLM
@@ -14,7 +14,7 @@
 
 FLASH_ATTENTION = os.getenv("FLASH_ATTENTION", "false").lower() == "true"
 
-__all__ = ["Model", "CausalLM", "Seq2SeqLM", "get_model", "FLASH_ATTENTION"]
+__all__ = ["Model", "CausalLM", "Seq2SeqLM", "get_model", "FLASH_ATTENTION", "PT2_COMPILE"]
 
 # The flag below controls whether to allow TF32 on matmul. This flag defaults to False
 # in PyTorch 1.12 and later.

server/text_generation_server/models/model.py

Lines changed: 7 additions & 1 deletion
@@ -103,6 +103,13 @@ def parse_kwargs(kwargs):
             if pkv is not None:
                 if type(pkv) != type_pkv_dim0 or type(pkv[0]) != type_pkv_dim1:
                     kwargs["past_key_values"] = type_pkv_dim0(type_pkv_dim1(t) for t in pkv)
+
+                for t in pkv:
+                    for x in t:
+                        strides = list(x.stride())
+                        if strides != sorted(strides, reverse=True):
+                            x.data = x.data.clone(memory_format=torch.contiguous_format)
+
             return kwargs
 
         def override_forward_with_compile(self, *args, **kwargs):
@@ -113,7 +120,6 @@ def override_forward_with_run(self, *args, **kwargs):
             kwargs = parse_kwargs(kwargs)
             return run_forward(*args, **kwargs)
 
-        self.compiled = True
         self.model.forward = types.MethodType(override_forward_with_compile, self.model)
         self.model.run_forward = types.MethodType(override_forward_with_run, self.model)
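
The stride check added above relies on the fact that a contiguous tensor stores its strides in descending order, while a transposed view generally does not. A quick standalone illustration in plain PyTorch, not part of the PR:

import torch

x = torch.zeros(2, 3, 4)
print(x.stride())        # (12, 4, 1): descending, already contiguous
y = x.transpose(1, 2)    # a non-contiguous view
print(y.stride())        # (12, 1, 4): not descending

strides = list(y.stride())
print(strides != sorted(strides, reverse=True))  # True -> needs a clone

z = y.clone(memory_format=torch.contiguous_format)
print(z.is_contiguous())  # True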
server/text_generation_server/models/seq2seq_lm.py

Lines changed: 9 additions & 2 deletions
@@ -10,7 +10,7 @@
 
 from transformers.modeling_outputs import BaseModelOutput
 
-from text_generation_server.models.model import Model, CUDA_PAD_TO_MULT_OF_8
+from text_generation_server.models.model import Model, CUDA_PAD_TO_MULT_OF_8, PT2_COMPILE
 from text_generation_server.models.types import Batch, GenerateError
 from text_generation_server.pb import generate_pb2
 from text_generation_server.prompt_cache import PrefixCache
@@ -207,7 +207,14 @@ def from_pb(
                 decoder_input_ids[:, -1] = tokenizer.bos_token_id
             else:
                 decoder_inputs_embeds = None
-                decoder_attention_mask = None
+                if PT2_COMPILE:
+                    decoder_attention_mask = attention_mask.new_zeros(
+                        batch_size, max_decoder_input_length + padding_right_offset
+                    )
+                    decoder_attention_mask[:, 0] = 1
+                else:
+                    decoder_attention_mask = None
+
             # Each decoder sequence only contains the bos_token
             # so decoder_input_ids is a torch tensor of size [batch_size, 1]
             decoder_input_ids = input_ids.new_full((batch_size, 1), tokenizer.bos_token_id)
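
For a fresh batch every decoder sequence holds only the BOS token, so the pre-allocated mask marks just position 0 and leaves room for the tokens still to be generated, matching the mask shape a post-concat batch already carries. A small standalone example with illustrative sizes (not taken from the PR):

import torch

batch_size = 2
max_decoder_input_length = 1   # only the BOS token so far
padding_right_offset = 3       # space reserved for new tokens (illustrative)

# stand-in for the encoder attention mask; only its dtype/device are borrowed
attention_mask = torch.ones(batch_size, 5, dtype=torch.long)

decoder_attention_mask = attention_mask.new_zeros(
    batch_size, max_decoder_input_length + padding_right_offset
)
decoder_attention_mask[:, 0] = 1
print(decoder_attention_mask)
# tensor([[1, 0, 0, 0],
#         [1, 0, 0, 0]])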

server/text_generation_server/server.py

Lines changed: 2 additions & 2 deletions
@@ -15,7 +15,7 @@
 from typing import List, Optional
 
 from text_generation_server.cache import Cache
-from text_generation_server.models import Model, get_model, Seq2SeqLM
+from text_generation_server.models import Model, get_model, Seq2SeqLM, PT2_COMPILE
 from text_generation_server.models.flash_causal_lm import FlashCausalLM
 from text_generation_server.pb import generate_pb2_grpc, generate_pb2
 from text_generation_server.pb.generate_pb2 import ModelInfoResponse
@@ -305,7 +305,7 @@ async def serve_inner(
             t = threading.Thread(target=partial(log_gpu_stats, device, interval))
             t.start()
 
-        if model.compiled:
+        if PT2_COMPILE:
             # trigger pt2 compile for variety of tensor shapes
             print("Warming up PyTorch 2 compile...")
             warmup_t0 = time.time()

server/text_generation_server/utils/warmup.py

Lines changed: 20 additions & 37 deletions
@@ -39,10 +39,10 @@ def __force_contiguous(x):
         x.data = x.data.contiguous(memory_format=torch.channels_last).contiguous()
         return x
 
-    def __eval_shape(batch_size: int, sequence_length: int, num_new_tokens: int, as_concat: bool = False):
+    def __eval_shape(batch_size: int, sequence_length: int, num_new_tokens: int):
 
         if verbose:
-            print(">> evaluating shape (batch_size: %d, sequence_length: %d, num_new_tokens: %d), as_concat: %d" % (batch_size, sequence_length, num_new_tokens, as_concat))
+            print(">> evaluating shape (batch_size: %d, sequence_length: %d, num_new_tokens: %d)" % (batch_size, sequence_length, num_new_tokens))
 
         input_length = sequence_length - num_new_tokens
 
@@ -59,27 +59,16 @@ def __eval_shape(batch_size: int, sequence_length: int, num_new_tokens: int, as_
             use_position_ids=model.use_position_ids,
         )
 
-        if as_concat and has_decoder_attention_mask:
-            batch.decoder_attention_mask = batch.attention_mask.new_zeros(
-                batch_size,
-                batch.max_decoder_input_length + batch.padding_right_offset
-            )
-            batch.decoder_attention_mask[:, 0:-batch.padding_right_offset] = 1
-
         model.generate_token(
             batch, first=True, for_concat=False,
         )
 
         for i in range(num_new_tokens-1):
-
-            if as_concat:
-                batch.past_key_values = tuple(tuple(__force_contiguous(t) for t in layer) for layer in batch.past_key_values)
-
             model.generate_token(batch)
 
-    def __safe_eval_shape(batch_size: int, sequence_length: int, num_new_tokens: int, as_concat: bool = False):
+    def __safe_eval_shape(batch_size: int, sequence_length: int, num_new_tokens: int):
         try:
-            __eval_shape(batch_size, sequence_length, num_new_tokens, as_concat)
+            __eval_shape(batch_size, sequence_length, num_new_tokens)
         except Exception as e:
             print(">> caught exception: ", e)
 
@@ -101,30 +90,26 @@ def __max_new_tokens_for_sequence_length(sequence_length: int):
     if verbose:
         print("[Phase 1] Probing boundaries.")
 
-    for as_concat in [True, False]:
-        for batch_size in [1, max_batch_size]:
-            max_sequence_length_for_batch_size = __max_sequence_length_for_batch_size(batch_size)
-            for sequence_length in [2, 3, max_sequence_length_for_batch_size]:
+    for batch_size in [1, max_batch_size]:
+        max_sequence_length_for_batch_size = __max_sequence_length_for_batch_size(batch_size)
+        for sequence_length in [2, 3, max_sequence_length_for_batch_size]:
+            __safe_eval_shape(
+                batch_size=batch_size,
+                sequence_length=sequence_length,
+                num_new_tokens=1,
+            )
+            if sequence_length > 2:
+                __safe_eval_shape(
+                    batch_size=batch_size,
+                    sequence_length=sequence_length,
+                    num_new_tokens=2,
+                )
+            if sequence_length > 3:
                 __safe_eval_shape(
                     batch_size=batch_size,
                     sequence_length=sequence_length,
-                    num_new_tokens=1,
-                    as_concat=as_concat,
+                    num_new_tokens=__max_new_tokens_for_sequence_length(sequence_length),
                 )
-                if sequence_length > 2:
-                    __safe_eval_shape(
-                        batch_size=batch_size,
-                        sequence_length=sequence_length,
-                        num_new_tokens=2,
-                        as_concat=as_concat,
-                    )
-                if sequence_length > 3:
-                    __safe_eval_shape(
-                        batch_size=batch_size,
-                        sequence_length=sequence_length,
-                        num_new_tokens=__max_new_tokens_for_sequence_length(sequence_length),
-                        as_concat=as_concat,
-                    )
 
     if verbose:
         print("[Phase 2] Probing random valid tensor shapes.")
@@ -142,12 +127,10 @@ def __max_new_tokens_for_sequence_length(sequence_length: int):
     rs = np.random.RandomState(seed=42)
     for i in range(n_samples):
         shape = valid_shapes[rs.randint(low=0, high=len(valid_shapes))]
-        as_concat = rs.choice([True, False])
         __safe_eval_shape(
             batch_size=shape[0],
             sequence_length=shape[1],
             num_new_tokens=shape[2],
-            as_concat=as_concat,
        )
         if verbose:
             print(">> n_samples: %3d, n_new_compiles: %3d" % (i+1, model.n_kernels-n_compiles))
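
The issue also asks to verify that the added work is cheap. A rough way to sanity-check the common case, where every past-key-value tensor is already contiguous and the stride test is a no-op, is to time the check on dummy tensors; the shapes below are illustrative only, not taken from the PR:

import time
import torch

pkv = tuple(
    tuple(torch.zeros(8, 16, 128, 64) for _ in range(2))  # (key, value) per layer
    for _ in range(24)                                     # 24 layers, illustrative
)

t0 = time.time()
for layer in pkv:
    for x in layer:
        strides = list(x.stride())
        if strides != sorted(strides, reverse=True):
            x.data = x.data.clone(memory_format=torch.contiguous_format)
print("stride check over all layers: %.3f ms" % ((time.time() - t0) * 1e3))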
