
Commit 3fc754e

Update to OpenVINO 2023.3, stateful model support
1 parent 16fc318 commit 3fc754e

File tree: 5 files changed, +51 / -14 lines

Dockerfile
server/poetry.lock
server/pyproject.toml
server/text_generation_server/inference_engine/hf_optimum_ov.py
server/text_generation_server/models/causal_lm.py

Dockerfile

Lines changed: 1 addition & 1 deletion
@@ -310,7 +310,7 @@ RUN --mount=type=bind,from=auto-gptq-cache,src=/usr/src/auto-gptq-wheel,target=/
 # Install server
 COPY proto proto
 COPY server server
-# RUN --mount=type=cache,target=/root/.cache/pip cd server && make gen-server && pip install ".[accelerate, openvino]"
+# RUN --mount=type=cache,target=/root/.cache/pip cd server && make gen-server && pip install ".[accelerate, onnx-gpu, openvino, quantize]"
 RUN cd server && make gen-server && pip install ".[accelerate, onnx-gpu, openvino, quantize]" --no-cache-dir

 # Patch codegen model changes into transformers 4.35

server/poetry.lock

Lines changed: 31 additions & 1 deletion
Some generated files are not rendered by default.

server/pyproject.toml

Lines changed: 2 additions & 0 deletions
@@ -23,6 +23,7 @@ datasets = { version = "^2.15.0", optional = true }
 texttable = { version = "^1.7.0", optional = true }
 transformers = "4.37.1"
 optimum = { version = "^1.16.2", extras = ["onnxruntime-gpu"], optional = true }
+optimum-intel = { version = ">=1.14.0", extras = ["openvino,nncf"], optional = true }
 onnxruntime = { version = "^1.16.3", optional = true }
 onnxruntime-gpu = { version = "^1.16.3", optional = true }
 onnx = { version = "^1.15.0", optional = true }
@@ -41,6 +42,7 @@ accelerate = ["accelerate"]
 bnb = ["bitsandbytes"]
 onnx = ["optimum", "onnxruntime", "onnx"]
 onnx-gpu = ["optimum", "onnxruntime-gpu", "onnx"]
+openvino = ["optimum-intel"]
 # These are only required if using the quantize cli command
 quantize = ["datasets", "texttable"]
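
The new openvino extra only pulls in optimum-intel (with its openvino and nncf extras) on top of the base dependencies. A minimal sanity check, not part of the commit, assuming the extra has been installed with pip install ".[openvino]"; the imports below are the standard optimum-intel and OpenVINO entry points:

from importlib.metadata import version

from openvino.runtime import get_version        # provided by the openvino runtime package
from optimum.intel import OVModelForCausalLM    # provided by optimum-intel[openvino,nncf]

print("optimum-intel:", version("optimum-intel"))
print("OpenVINO runtime:", get_version())
print("OVModelForCausalLM import OK:", OVModelForCausalLM.__name__)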

server/text_generation_server/inference_engine/hf_optimum_ov.py

Lines changed: 5 additions & 3 deletions
@@ -51,11 +51,13 @@ def __init__(
         dtype: torch.dtype,
         quantize: Optional[str], # not used by OpenVINO
         model_config: Optional[Any],
+        max_sequence_length: Optional[int],
     ) -> None:
         super().__init__(model_path, model_config)
         print(f"Optimum Intel version: {__version__}")
         print(f"OpenVINO version: {get_version()}")
         print("model_path:", model_path)
+        os.environ["OPENVINO_LOG_LEVEL"] = "4"

         if model_class == AutoModelForCausalLM:
             model_class = OVModelForCausalLM
@@ -68,13 +70,13 @@ def __init__(
         if ov_config_file is not None:
             ov_config = json.loads(Path(ov_config_file).read_text())
         else:
-            ov_config = {"CACHE_DIR": ""}
+            ov_config = {}

         # Set good default options for latency-optimized workflow
         if "PERFORMANCE_HINT" not in ov_config:
             ov_config["PERFORMANCE_HINT"] = "LATENCY"
-        if "NUM_STREAMS" not in ov_config and "PERFORMANCE_HINT_NUM_REQUESTS" not in ov_config:
-            ov_config["PERFORMANCE_HINT_NUM_REQUESTS"] = 1
+        if "NUM_STREAMS" not in ov_config:
+            ov_config["NUM_STREAMS"] = 1

         print(f"ov_config: {ov_config}")
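
For context, a standalone sketch (not part of the commit) of how these latency-oriented defaults behave when a model is loaded directly through optimum-intel. The model id is illustrative only, and export=True converts the checkpoint to OpenVINO IR on the fly:

from optimum.intel import OVModelForCausalLM
from transformers import AutoTokenizer

# Same defaults that hf_optimum_ov.py now applies when no ov_config file is given.
ov_config = {"PERFORMANCE_HINT": "LATENCY", "NUM_STREAMS": 1}

model = OVModelForCausalLM.from_pretrained(
    "gpt2",           # illustrative model id
    export=True,      # convert the Transformers checkpoint to OpenVINO IR
    ov_config=ov_config,
)
# With recent optimum-intel and OpenVINO 2023.3, exported decoders are typically stateful:
# the KV cache is kept inside the compiled model rather than returned as tensors.
print("stateful:", model.stateful)

tokenizer = AutoTokenizer.from_pretrained("gpt2")
inputs = tokenizer("Hello", return_tensors="pt")
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=8)[0], skip_special_tokens=True))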

server/text_generation_server/models/causal_lm.py

Lines changed: 12 additions & 9 deletions
@@ -571,16 +571,19 @@ def __init__(
         else:
             self.tokenizer.add_special_tokens({"pad_token": "[PAD]"})

-        # Perform a forward pass to determine the structure of the past_key_values
-        one_token = torch.tensor([[1]], device=inference_engine.get_device())
-        _, past_key_values, _ = self.forward(input_ids=one_token, attention_mask=one_token)
-        if torch.is_tensor(past_key_values[0]):
-            self.batch_type = CombinedKVCausalLMBatch
+        if deployment_framework == "hf_optimum_ov" and self.model.stateful:
+            self.batch_type = CausalLMBatch
         else:
-            # check the ordering of the key tensor dimensions
-            key_past, value_past = past_key_values[0]
-            keys_head_dim_last = key_past.shape[-1] == value_past.shape[-1]
-            self.batch_type = CausalLMBatch if keys_head_dim_last else KeysDimTransposedCausalLMBatch
+            # Perform a forward pass to determine the structure of the past_key_values
+            one_token = torch.tensor([[1]], device=inference_engine.get_device())
+            _, past_key_values, _ = self.forward(input_ids=one_token, attention_mask=one_token)
+            if torch.is_tensor(past_key_values[0]):
+                self.batch_type = CombinedKVCausalLMBatch
+            else:
+                # check the ordering of the key tensor dimensions
+                key_past, value_past = past_key_values[0]
+                keys_head_dim_last = key_past.shape[-1] == value_past.shape[-1]
+                self.batch_type = CausalLMBatch if keys_head_dim_last else KeysDimTransposedCausalLMBatch

     @property
     def batch_type(self) -> Type[CausalLMBatch]:
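
The stateful short-circuit is needed because a stateful OpenVINO model keeps its KV cache inside the compiled model's internal state, so the probe forward pass would have no past_key_values tensors to inspect. A minimal sketch of the same selection logic pulled out into a hypothetical helper (batch classes and torch as already imported in causal_lm.py):

def select_batch_type(model, deployment_framework, probe_past_key_values):
    # Stateful OpenVINO models manage the KV cache internally; nothing to probe.
    if deployment_framework == "hf_optimum_ov" and getattr(model, "stateful", False):
        return CausalLMBatch
    # Otherwise run a one-token forward pass and inspect the returned layout.
    past_key_values = probe_past_key_values()
    if torch.is_tensor(past_key_values[0]):
        # Keys and values are fused into a single tensor per layer.
        return CombinedKVCausalLMBatch
    key_past, value_past = past_key_values[0]
    # Matching last dimensions means the key tensor is head-dim-last.
    if key_past.shape[-1] == value_past.shape[-1]:
        return CausalLMBatch
    return KeysDimTransposedCausalLMBatch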
