
Commit 3fc754e

Update to OpenVINO 2023.3, stateful model support
1 parent 16fc318 commit 3fc754e

File tree: 5 files changed, +51 / -14 lines

Dockerfile
server/poetry.lock
server/pyproject.toml
server/text_generation_server/inference_engine/hf_optimum_ov.py
server/text_generation_server/models/causal_lm.py

Dockerfile

Lines changed: 1 addition & 1 deletion
@@ -310,7 +310,7 @@ RUN --mount=type=bind,from=auto-gptq-cache,src=/usr/src/auto-gptq-wheel,target=/
 # Install server
 COPY proto proto
 COPY server server
-# RUN --mount=type=cache,target=/root/.cache/pip cd server && make gen-server && pip install ".[accelerate, openvino]"
+# RUN --mount=type=cache,target=/root/.cache/pip cd server && make gen-server && pip install ".[accelerate, onnx-gpu, openvino, quantize]"
 RUN cd server && make gen-server && pip install ".[accelerate, onnx-gpu, openvino, quantize]" --no-cache-dir

 # Patch codegen model changes into transformers 4.35

server/poetry.lock

Lines changed: 31 additions & 1 deletion
Some generated files are not rendered by default.

server/pyproject.toml

Lines changed: 2 additions & 0 deletions
@@ -23,6 +23,7 @@ datasets = { version = "^2.15.0", optional = true }
 texttable = { version = "^1.7.0", optional = true }
 transformers = "4.37.1"
 optimum = { version = "^1.16.2", extras = ["onnxruntime-gpu"], optional = true }
+optimum-intel = { version = ">=1.14.0", extras = ["openvino,nncf"], optional = true }
 onnxruntime = { version = "^1.16.3", optional = true }
 onnxruntime-gpu = { version = "^1.16.3", optional = true }
 onnx = { version = "^1.15.0", optional = true }
@@ -41,6 +42,7 @@ accelerate = ["accelerate"]
 bnb = ["bitsandbytes"]
 onnx = ["optimum", "onnxruntime", "onnx"]
 onnx-gpu = ["optimum", "onnxruntime-gpu", "onnx"]
+openvino = ["optimum-intel"]
 # These are only required if using the quantize cli command
 quantize = ["datasets", "texttable"]
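
The new openvino extra only pulls in optimum-intel (with its openvino and nncf extras) on top of the base dependencies. A minimal sanity check, not part of the commit, assuming the extra has been installed with pip install ".[openvino]"; the imports below are the standard optimum-intel and OpenVINO entry points:

from importlib.metadata import version

from openvino.runtime import get_version        # provided by the openvino runtime package
from optimum.intel import OVModelForCausalLM    # provided by optimum-intel[openvino,nncf]

print("optimum-intel:", version("optimum-intel"))
print("OpenVINO runtime:", get_version())
print("OVModelForCausalLM import OK:", OVModelForCausalLM.__name__)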

server/text_generation_server/inference_engine/hf_optimum_ov.py

Lines changed: 5 additions & 3 deletions
@@ -51,11 +51,13 @@ def __init__(
         dtype: torch.dtype,
         quantize: Optional[str], # not used by OpenVINO
         model_config: Optional[Any],
+        max_sequence_length: Optional[int],
     ) -> None:
         super().__init__(model_path, model_config)
         print(f"Optimum Intel version: {__version__}")
         print(f"OpenVINO version: {get_version()}")
         print("model_path:", model_path)
+        os.environ["OPENVINO_LOG_LEVEL"] = "4"

         if model_class == AutoModelForCausalLM:
             model_class = OVModelForCausalLM
@@ -68,13 +70,13 @@ def __init__(
         if ov_config_file is not None:
             ov_config = json.loads(Path(ov_config_file).read_text())
         else:
-            ov_config = {"CACHE_DIR": ""}
+            ov_config = {}

         # Set good default options for latency-optimized workflow
         if "PERFORMANCE_HINT" not in ov_config:
             ov_config["PERFORMANCE_HINT"] = "LATENCY"
-        if "NUM_STREAMS" not in ov_config and "PERFORMANCE_HINT_NUM_REQUESTS" not in ov_config:
-            ov_config["PERFORMANCE_HINT_NUM_REQUESTS"] = 1
+        if "NUM_STREAMS" not in ov_config:
+            ov_config["NUM_STREAMS"] = 1

         print(f"ov_config: {ov_config}")
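
For context, a standalone sketch (not part of the commit) of how these latency-oriented defaults behave when a model is loaded directly through optimum-intel. The model id is illustrative only, and export=True converts the checkpoint to OpenVINO IR on the fly:

from optimum.intel import OVModelForCausalLM
from transformers import AutoTokenizer

# Same defaults that hf_optimum_ov.py now applies when no ov_config file is given.
ov_config = {"PERFORMANCE_HINT": "LATENCY", "NUM_STREAMS": 1}

model = OVModelForCausalLM.from_pretrained(
    "gpt2",           # illustrative model id
    export=True,      # convert the Transformers checkpoint to OpenVINO IR
    ov_config=ov_config,
)
# With recent optimum-intel and OpenVINO 2023.3, exported decoders are typically stateful:
# the KV cache is kept inside the compiled model rather than returned as tensors.
print("stateful:", model.stateful)

tokenizer = AutoTokenizer.from_pretrained("gpt2")
inputs = tokenizer("Hello", return_tensors="pt")
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=8)[0], skip_special_tokens=True))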

server/text_generation_server/models/causal_lm.py

Lines changed: 12 additions & 9 deletions
@@ -571,16 +571,19 @@ def __init__(
         else:
             self.tokenizer.add_special_tokens({"pad_token": "[PAD]"})

-        # Perform a forward pass to determine the structure of the past_key_values
-        one_token = torch.tensor([[1]], device=inference_engine.get_device())
-        _, past_key_values, _ = self.forward(input_ids=one_token, attention_mask=one_token)
-        if torch.is_tensor(past_key_values[0]):
-            self.batch_type = CombinedKVCausalLMBatch
+        if deployment_framework == "hf_optimum_ov" and self.model.stateful:
+            self.batch_type = CausalLMBatch
         else:
-            # check the ordering of the key tensor dimensions
-            key_past, value_past = past_key_values[0]
-            keys_head_dim_last = key_past.shape[-1] == value_past.shape[-1]
-            self.batch_type = CausalLMBatch if keys_head_dim_last else KeysDimTransposedCausalLMBatch
+            # Perform a forward pass to determine the structure of the past_key_values
+            one_token = torch.tensor([[1]], device=inference_engine.get_device())
+            _, past_key_values, _ = self.forward(input_ids=one_token, attention_mask=one_token)
+            if torch.is_tensor(past_key_values[0]):
+                self.batch_type = CombinedKVCausalLMBatch
+            else:
+                # check the ordering of the key tensor dimensions
+                key_past, value_past = past_key_values[0]
+                keys_head_dim_last = key_past.shape[-1] == value_past.shape[-1]
+                self.batch_type = CausalLMBatch if keys_head_dim_last else KeysDimTransposedCausalLMBatch

     @property
     def batch_type(self) -> Type[CausalLMBatch]:
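
The stateful short-circuit is needed because a stateful OpenVINO model keeps its KV cache inside the compiled model's internal state, so the probe forward pass would have no past_key_values tensors to inspect. A minimal sketch of the same selection logic pulled out into a hypothetical helper (batch classes and torch as already imported in causal_lm.py):

def select_batch_type(model, deployment_framework, probe_past_key_values):
    # Stateful OpenVINO models manage the KV cache internally; nothing to probe.
    if deployment_framework == "hf_optimum_ov" and getattr(model, "stateful", False):
        return CausalLMBatch
    # Otherwise run a one-token forward pass and inspect the returned layout.
    past_key_values = probe_past_key_values()
    if torch.is_tensor(past_key_values[0]):
        # Keys and values are fused into a single tensor per layer.
        return CombinedKVCausalLMBatch
    key_past, value_past = past_key_values[0]
    # Matching last dimensions means the key tensor is head-dim-last.
    if key_past.shape[-1] == value_past.shape[-1]:
        return CausalLMBatch
    return KeysDimTransposedCausalLMBatch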
