
Commit 7bdcb49

tdoublep authored and njhill committed
Fixes to enable PT2C for ibm/mpt-7b-instruct2
There were multiple issues using PT2C with this model:

1. PT2C fails with the error `'NoneType' object has no attribute 'node'` when compiling the first kernel. This is an issue on the PT side and I have opened [an issue](pytorch/pytorch#107721) accordingly. There is a simple workaround, which is just to call forward once before compiling, so this does not block us for now.

2. There are issues using PT2C dynamic shapes together with the `einops` package. Fixed by updating einops to the latest rc version.

3. The other models that we've tried with PT2C so far return the `past_key_values` (pkv) tensors as a tuple of tuples. The exception is when we concatenate a batch, after which the `past_key_values` tensors are a list of lists. Since type changes break the PT2C guards, we had logic in the code to check whether the first dimension is a list and, if so, convert the pkvs to a tuple of tuples. This logic breaks down for this model, because its forward function returns the pkvs as a list of tuples and starts erroring out if we try to pass them in as a tuple of tuples. To solve this, I've added logic to detect which types the model expects and, in the case of concatenation, always convert to those types.

4. There is one line in the modelling code that PT2C does not play well with. It compares the shape of the `attention_mask` to the actual values inside it, which creates complete chaos: guards break whenever we concatenate batches that were started at different times. The solution is the trivial change below in the modelling code. Ideally we could get this change applied to the model on the HF side; until then, it can be patched in our local versions. Note that without this change in the model everything still works, but we fall back to eager more often than if the change is applied.
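The modelling-code change itself is not part of this commit. A hedged sketch of the kind of rewrite described, assuming the culprit is the right-padding check in MPT's prepare_inputs_for_generation (both the original line and the replacement are assumptions, not the verified patch):

    # before (assumed): compares a reduction over attention_mask *values* against
    # its (dynamic) batch dimension, so compiled guards end up depending on data
    if attention_mask[:, -1].sum() != attention_mask.shape[0]:
        raise NotImplementedError("MPT does not support generation with right padding.")

    # after (assumed): the same check -- every sequence must end in a non-padded
    # token -- expressed without referencing the shape
    if not attention_mask[:, -1].all():
        raise NotImplementedError("MPT does not support generation with right padding.")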
1 parent 62de0ed commit 7bdcb49

File tree

6 files changed: +42, -8 lines

server/poetry.lock

Lines changed: 5 additions & 5 deletions
(Generated file; diff not rendered.)

server/pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -23,7 +23,7 @@ optimum = { version = "1.11.0", extras = ["onnxruntime-gpu"], optional = true }
 onnxruntime = { version = "1.15.1", optional = true }
 onnxruntime-gpu = { version = "1.15.1", optional = true }
 onnx = { version = "1.14.0", optional = true }
-einops = "^0.6.1"
+einops = "^0.7.0rc2"

 # Explicitly install some transitive dependencies to avoid CVEs
 mpmath = ">=1.3.0"
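The einops bump pairs with the allow_ops_in_compiled_graph() call added in model.py below. A minimal standalone sketch of that hook, with an illustrative rearrange (the pattern and shapes are made up):

    import torch
    from einops import rearrange
    from einops._torch_specific import allow_ops_in_compiled_graph

    # register einops ops with torch.compile so rearrange/reduce calls
    # do not cause graph breaks or spurious recompiles
    allow_ops_in_compiled_graph()

    @torch.compile(dynamic=True)
    def split_heads(x: torch.Tensor) -> torch.Tensor:
        return rearrange(x, "b s (h d) -> b h s d", h=8)

    print(split_heads(torch.randn(2, 16, 64)).shape)  # torch.Size([2, 8, 16, 8])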

server/text_generation_server/inference_engine/hf_transformers.py

Lines changed: 4 additions & 0 deletions
@@ -23,6 +23,10 @@ def __init__(
             "trust_remote_code": TRUST_REMOTE_CODE,
         }

+        if model_config.model_type == "mpt":
+            model_config.init_device = str(self.device)
+            kwargs["config"] = model_config
+
         if dtype == torch.int8:
             # using LLM.int8()
             kwargs["load_in_8bit"] = True

server/text_generation_server/models/causal_lm.py

Lines changed: 8 additions & 0 deletions
@@ -481,6 +481,14 @@ def batch_type(self) -> Type[CausalLMBatch]:
     def batch_type(self, value):
         self._batch_type = value

+    def determine_pkv_types(self) -> Tuple[Type, Type]:
+        one_token = torch.tensor([[1]], device=self.device)
+        _, pkv, _ = self.forward(
+            input_ids=one_token,
+            attention_mask=one_token,
+        )
+        return type(pkv), type(pkv[0])
+
     def forward(
         self,
         input_ids: torch.Tensor,
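The probe runs a real single-token forward pass and records the container types of the returned past_key_values; model.py then treats those types as the canonical form. An illustrative reading of what it yields (the concrete pairings restate the commit message, not verified output):

    type_pkv_dim0, type_pkv_dim1 = model.determine_pkv_types()
    # typical HF decoder models:       (tuple, tuple)
    # after our batch concatenation:   (list, list)
    # ibm/mpt-7b-instruct2's forward:  (list, tuple)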

server/text_generation_server/models/model.py

Lines changed: 15 additions & 2 deletions
@@ -27,6 +27,8 @@
 if PT2_COMPILE:
     import torch._dynamo
     from torch._inductor.compile_fx import compile_fx
+    from einops._torch_specific import allow_ops_in_compiled_graph
+    allow_ops_in_compiled_graph()


 class Model(ABC):
@@ -70,6 +72,12 @@ def __init__(self, engine: BaseInferenceEngine, dtype: torch.dtype):
         if not PT2_COMPILE:
             self.compiled = False
         else:
+
+            # Perform a forward pass using a single token. This serves 2 purposes:
+            # (1) work-around for PT2C issue #107721
+            # (2) determine types of past_key_value output
+            type_pkv_dim0, type_pkv_dim1 = self.determine_pkv_types()
+
             torch._dynamo.config.cache_size_limit = 512
             self.n_kernels = 0

@@ -93,8 +101,13 @@ def count_kernels(guard):
             run_forward = torch._dynamo.run(compiled_forward)

             def parse_kwargs(kwargs):
-                if "past_key_values" in kwargs and type(kwargs["past_key_values"]) is list:
-                    kwargs["past_key_values"] = tuple(tuple(t) for t in kwargs["past_key_values"])
+                # after batch concatenation the past_key_value tensor is a list of lists.
+                # this will lead to guard failures unless we convert them to the typical
+                # types that we expect to be returned by forward.
+                pkv = kwargs.get("past_key_values")
+                if pkv is not None:
+                    if type(pkv) != type_pkv_dim0 or type(pkv[0]) != type_pkv_dim1:
+                        kwargs["past_key_values"] = type_pkv_dim0(type_pkv_dim1(t) for t in pkv)
                 return kwargs

             def override_forward_with_compile(self, *args, **kwargs):
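For intuition on why the conversion matters: dynamo guards on the Python container types of compiled-function inputs, so swapping a tuple-of-tuples for a list-of-lists invalidates the compiled graph. A minimal illustrative repro, independent of this codebase:

    import torch

    @torch.compile
    def first_kv(pkv):
        return pkv[0][0] + 1

    t = torch.ones(2)
    first_kv(((t,),))  # traced and guarded with tuple-of-tuples
    first_kv([[t]])    # same data, different container types -> guard miss, recompile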

server/text_generation_server/models/seq2seq_lm.py

Lines changed: 9 additions & 0 deletions
@@ -510,6 +510,15 @@ def batch_type(self) -> Type[Seq2SeqLMBatch]:
     def batch_type(self, value):
         self._batch_type = value

+    def determine_pkv_types(self) -> Tuple[Type, Type]:
+        one_token = torch.tensor([[1]], device=self.device)
+        _, _, pkv, _ = self.forward(
+            input_ids=one_token,
+            attention_mask=one_token,
+            decoder_input_ids=one_token,
+        )
+        return type(pkv), type(pkv[0])
+
     def forward(
         self,
         input_ids: torch.Tensor,
