From 83552d7941e7f70553b1cd88bdf3457ca51d73ce Mon Sep 17 00:00:00 2001
From: qinxuye
Date: Thu, 20 Jun 2024 13:40:58 +0000
Subject: [PATCH] FEAT: Initial support for OpenVINO

---
 xinference/model/llm/__init__.py      |   4 +
 xinference/model/llm/llm_family.py    |   2 +
 xinference/model/llm/openvino/core.py | 341 ++++++++++++++++++++++++++
 3 files changed, 347 insertions(+)
 create mode 100644 xinference/model/llm/openvino/core.py

diff --git a/xinference/model/llm/__init__.py b/xinference/model/llm/__init__.py
index fb56d82488..0ab7a4f3e1 100644
--- a/xinference/model/llm/__init__.py
+++ b/xinference/model/llm/__init__.py
@@ -34,6 +34,7 @@
     BUILTIN_MODELSCOPE_LLM_FAMILIES,
     LLAMA_CLASSES,
     LLM_ENGINES,
+    OPENVINO_CLASSES,
     SGLANG_CLASSES,
     SUPPORTED_ENGINES,
     TRANSFORMERS_CLASSES,
@@ -112,6 +113,7 @@ def generate_engine_config_by_model_family(model_family):
 def _install():
     from .ggml.chatglm import ChatglmCppChatModel
     from .ggml.llamacpp import LlamaCppChatModel, LlamaCppModel
+    from .openvino.core import OpenVINOChatModel, OpenVINOModel
     from .pytorch.baichuan import BaichuanPytorchChatModel
     from .pytorch.chatglm import ChatglmPytorchChatModel
     from .pytorch.cogvlm2 import CogVLM2Model
@@ -147,6 +149,7 @@ def _install():
     )
     SGLANG_CLASSES.extend([SGLANGModel, SGLANGChatModel])
     VLLM_CLASSES.extend([VLLMModel, VLLMChatModel])
+    OPENVINO_CLASSES.extend([OpenVINOModel, OpenVINOChatModel])
     TRANSFORMERS_CLASSES.extend(
         [
             BaichuanPytorchChatModel,
@@ -176,6 +179,7 @@ def _install():
     SUPPORTED_ENGINES["SGLang"] = SGLANG_CLASSES
     SUPPORTED_ENGINES["Transformers"] = TRANSFORMERS_CLASSES
     SUPPORTED_ENGINES["llama.cpp"] = LLAMA_CLASSES
+    SUPPORTED_ENGINES["OpenVINO"] = OPENVINO_CLASSES
 
     json_path = os.path.join(
         os.path.dirname(os.path.abspath(__file__)), "llm_family.json"
diff --git a/xinference/model/llm/llm_family.py b/xinference/model/llm/llm_family.py
index b3b3c8dbc5..321fc470c8 100644
--- a/xinference/model/llm/llm_family.py
+++ b/xinference/model/llm/llm_family.py
@@ -248,6 +248,8 @@ def parse_raw(
 
 VLLM_CLASSES: List[Type[LLM]] = []
 
+OPENVINO_CLASSES: List[Type[LLM]] = []
+
 LLM_ENGINES: Dict[str, Dict[str, List[Dict[str, Any]]]] = {}
 SUPPORTED_ENGINES: Dict[str, List[Type[LLM]]] = {}
 
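
NOTE (illustrative, not part of the diff): with the registrations above, xinference
resolves a launch request carrying model_engine="OpenVINO" against OPENVINO_CLASSES,
picking the first class whose match() accepts the family, spec, and quantization.
A client call could look roughly like the sketch below; the endpoint and model name
are placeholders, and it assumes a chat-capable model family available in "pytorch"
format, the only format the new match() implementations accept.

    from xinference.client import Client

    client = Client("http://localhost:9997")  # placeholder endpoint
    uid = client.launch_model(
        model_name="qwen1.5-chat",  # placeholder: any pytorch-format chat family
        model_engine="OpenVINO",    # engine key registered by this patch
        model_format="pytorch",
    )
    model = client.get_model(uid)
    print(model.chat("What is OpenVINO?"))
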
diff --git a/xinference/model/llm/openvino/core.py b/xinference/model/llm/openvino/core.py
new file mode 100644
index 0000000000..3f21cc3082
--- /dev/null
+++ b/xinference/model/llm/openvino/core.py
@@ -0,0 +1,341 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import os.path
+from typing import Dict, Iterable, Iterator, List, Optional, TypedDict, Union
+
+from ....types import (
+    ChatCompletion,
+    ChatCompletionChunk,
+    ChatCompletionMessage,
+    Completion,
+    CompletionChunk,
+    CreateCompletionTorch,
+    LoRA,
+)
+from ...utils import select_device
+from ..core import LLM
+from ..llm_family import LLMFamilyV1, LLMSpecV1
+from ..utils import ChatModelMixin
+
+logger = logging.getLogger(__name__)
+
+
+class OpenVINOModelConfig(TypedDict, total=False):
+    revision: Optional[str]
+    device: str
+    gpus: Optional[str]
+    num_gpus: int
+    max_gpu_memory: str
+    gptq_ckpt: Optional[str]
+    gptq_wbits: int
+    gptq_groupsize: int
+    gptq_act_order: bool
+    trust_remote_code: bool
+
+
+class OpenVINOGenerateConfig(TypedDict, total=False):
+    temperature: float
+    repetition_penalty: float
+    top_p: float
+    top_k: int
+    stream: bool
+    max_tokens: int
+    echo: bool
+    stop: Optional[Union[str, List[str]]]
+    stop_token_ids: Optional[Union[int, List[int]]]
+    stream_interval: int
+    model: Optional[str]
+    tools: Optional[List[Dict]]
+    lora_name: Optional[str]
+    stream_options: Optional[dict]
+    request_id: Optional[str]
+
+
+class OpenVINOModel(LLM):
+    def __init__(
+        self,
+        model_uid: str,
+        model_family: "LLMFamilyV1",
+        model_spec: "LLMSpecV1",
+        quantization: str,
+        model_path: str,
+        model_config: Optional[OpenVINOModelConfig] = None,
+        peft_model: Optional[List[LoRA]] = None,
+    ):
+        super().__init__(model_uid, model_family, model_spec, quantization, model_path)
+        self._use_fast_tokenizer = True
+        self._model_config: OpenVINOModelConfig = self._sanitize_model_config(
+            model_config
+        )
+        if peft_model is not None:
+            raise ValueError("The OpenVINO engine does not support LoRA yet")
+
+    def _sanitize_model_config(
+        self, model_config: Optional[OpenVINOModelConfig]
+    ) -> OpenVINOModelConfig:
+        if model_config is None:
+            model_config = OpenVINOModelConfig()
+        model_config.setdefault("revision", self.model_spec.model_revision)
+        model_config.setdefault("gptq_ckpt", None)
+        model_config.setdefault("gptq_wbits", 16)
+        model_config.setdefault("gptq_groupsize", -1)
+        model_config.setdefault("gptq_act_order", False)
+        model_config.setdefault("device", "auto")
+        model_config.setdefault("trust_remote_code", True)
+        return model_config
+
+    def _sanitize_generate_config(
+        self,
+        generate_config: Optional[OpenVINOGenerateConfig],
+    ) -> OpenVINOGenerateConfig:
+        if generate_config is None:
+            generate_config = OpenVINOGenerateConfig(**CreateCompletionTorch().dict())
+        else:
+            # Validate generate_config and fill in default values.
+            generate_config = OpenVINOGenerateConfig(
+                **CreateCompletionTorch(**generate_config).dict()
+            )
+        generate_config["model"] = self.model_uid
+        return generate_config
+
+    def _load_model(self, **kwargs):
+        try:
+            from optimum.intel import OVModelForCausalLM
+        except ImportError:
+            error_message = "Failed to import module 'optimum.intel'"
+            installation_guide = [
+                "Please make sure 'optimum' is installed. ",
+                "You can install it by `pip install optimum[openvino,nncf]`\n",
+            ]
+
+            raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
+
+        from transformers import AutoTokenizer
+
+        tokenizer = AutoTokenizer.from_pretrained(
+            self.model_path,
+            use_fast=self._use_fast_tokenizer,
+            trust_remote_code=kwargs["trust_remote_code"],
+            revision=kwargs["revision"],
+        )
+        ov_path = self._convert_hf_to_ov(self.model_path, kwargs["revision"])
+        model = OVModelForCausalLM.from_pretrained(ov_path)
+        return model, tokenizer
+
+    def _convert_hf_to_ov(self, model_path: str, revision: Optional[str]) -> str:
+        from optimum.intel import OVModelForCausalLM
+
+        from ..llm_family import _generate_meta_file, valid_model_revision
+
+        root_dir = os.path.dirname(os.path.dirname(model_path))
+        ov_dir = os.path.join(
+            root_dir,
+            "{}-ov-{}b".format(
+                self.model_family.model_name, self.model_spec.model_size_in_billions
+            ),
+        )
+        meta_path = os.path.join(ov_dir, "__valid_download")
+        if os.path.exists(meta_path):
+            logger.info("Skip converting Hugging Face model to OpenVINO model")
+            valid_model_revision(meta_path, revision)
+            return ov_dir
+
+        if not os.path.exists(ov_dir):
+            os.makedirs(ov_dir)
+
+        logger.info("Converting model to OpenVINO")
+        model = OVModelForCausalLM.from_pretrained(model_path, export=True)
+        model.save_pretrained(ov_dir)
+        _generate_meta_file(
+            meta_path, self.model_family, self.model_spec, self.quantization
+        )
+        return ov_dir
+
+    def load(self):
+        device = self._model_config.get("device", "auto")
+        self._model_config["device"] = select_device(device)
+        self._device = self._model_config["device"]
+
+        kwargs = {}
+        kwargs["revision"] = self._model_config.get(
+            "revision", self.model_spec.model_revision
+        )
+        kwargs["trust_remote_code"] = self._model_config.get("trust_remote_code")
+
+        self._model, self._tokenizer = self._load_model(**kwargs)
+
+    @classmethod
+    def match(
+        cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
+    ) -> bool:
+        if llm_spec.model_format not in ["pytorch"]:
+            return False
+        if "generate" not in llm_family.model_ability:
+            return False
+        return True
+
+    def generate(
+        self, prompt: str, generate_config: Optional[OpenVINOGenerateConfig] = None
+    ) -> Union[Completion, Iterator[CompletionChunk]]:
+        from ..pytorch.utils import generate_stream
+
+        def generator_wrapper(
+            prompt: str, generate_config: OpenVINOGenerateConfig
+        ) -> Iterator[CompletionChunk]:
+            for completion_chunk, completion_usage in generate_stream(
+                self.model_uid,
+                self._model,
+                self._tokenizer,
+                prompt,
+                self._device,
+                generate_config,
+            ):
+                completion_chunk["usage"] = completion_usage
+                yield completion_chunk
+
+        logger.debug(
+            "Enter generate, prompt: %s, generate config: %s", prompt, generate_config
+        )
+
+        generate_config = self._sanitize_generate_config(generate_config)
+
+        assert self._model is not None
+        assert self._tokenizer is not None
+
+        stream = generate_config.get("stream", False)
+        if not stream:
+            for completion_chunk, completion_usage in generate_stream(
+                self.model_uid,
+                self._model,
+                self._tokenizer,
+                prompt,
+                self._device,
+                generate_config,
+            ):
+                pass
+            completion = Completion(
+                id=completion_chunk["id"],
+                object=completion_chunk["object"],
+                created=completion_chunk["created"],
+                model=completion_chunk["model"],
+                choices=completion_chunk["choices"],
+                usage=completion_usage,
+            )
+            return completion
+        else:
+            return generator_wrapper(prompt, generate_config)
+
+
+class OpenVINOChatModel(OpenVINOModel, ChatModelMixin):
+    def __init__(
+        self,
+        model_uid: str,
+        model_family: "LLMFamilyV1",
+        model_spec: "LLMSpecV1",
+        quantization: str,
+        model_path: str,
+        model_config: Optional[OpenVINOModelConfig] = None,
+        peft_model: Optional[List[LoRA]] = None,
+    ):
+        super().__init__(
+            model_uid,
+            model_family,
+            model_spec,
+            quantization,
+            model_path,
+            model_config,
+            peft_model,
+        )
+
+    def _sanitize_generate_config(
+        self,
+        generate_config: Optional[OpenVINOGenerateConfig],
+    ) -> OpenVINOGenerateConfig:
+        generate_config = super()._sanitize_generate_config(generate_config)
+        if (
+            (not generate_config.get("stop"))
+            and self.model_family.prompt_style
+            and self.model_family.prompt_style.stop
+        ):
+            generate_config["stop"] = self.model_family.prompt_style.stop.copy()
+        if (
+            generate_config.get("stop_token_ids", None) is None
+            and self.model_family.prompt_style
+            and self.model_family.prompt_style.stop_token_ids
+        ):
+            generate_config[
+                "stop_token_ids"
+            ] = self.model_family.prompt_style.stop_token_ids.copy()
+
+        return generate_config
+
+    @classmethod
+    def match(
+        cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
+    ) -> bool:
+        if llm_spec.model_format not in ["pytorch"]:
+            return False
+        if "chat" not in llm_family.model_ability:
+            return False
+        return True
+
+    def chat(
+        self,
+        prompt: str,
+        system_prompt: Optional[str] = None,
+        chat_history: Optional[List[ChatCompletionMessage]] = None,
+        generate_config: Optional[OpenVINOGenerateConfig] = None,
+    ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
+        tools = generate_config.pop("tools", []) if generate_config else None
+        full_prompt = self._get_full_prompt(prompt, system_prompt, chat_history, tools)
+
+        generate_config = self._sanitize_generate_config(generate_config)
+        # TODO(codingl2k1): hacky way to set stop for qwen function calls.
+        model_family = self.model_family.model_family or self.model_family.model_name
+        if tools and model_family in ["qwen-chat", "qwen1.5-chat"]:
+            stop = generate_config.get("stop")
+            if isinstance(stop, str):
+                generate_config["stop"] = [stop, "Observation:"]
+            elif isinstance(stop, Iterable):
+                assert not isinstance(stop, str)
+                generate_config["stop"] = list(stop) + ["Observation:"]
+            else:
+                generate_config["stop"] = "Observation:"
+
+        stream = generate_config.get("stream", False)
+        if stream:
+            it = self.generate(full_prompt, generate_config)
+            assert isinstance(it, Iterator)
+            return self._to_chat_completion_chunks(it)
+        else:
+            c = self.generate(full_prompt, generate_config)
+            assert not isinstance(c, Iterator)
+            if tools:
+                return self._tool_calls_completion(
+                    self.model_family, self.model_uid, c, tools
+                )
+            return self._to_chat_completion(c)
+
+    def _get_full_prompt(self, prompt, system_prompt, chat_history, tools):
+        assert self.model_family.prompt_style is not None
+        prompt_style = self.model_family.prompt_style.copy()
+        if system_prompt:
+            prompt_style.system_prompt = system_prompt
+        chat_history = chat_history or []
+        full_prompt = ChatModelMixin.get_prompt(
+            prompt, chat_history, prompt_style, tools=tools
+        )
+        return full_prompt
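
NOTE (illustrative, not part of the diff): a minimal sketch of the convert-then-load
flow that _convert_hf_to_ov() and _load_model() implement above, using only public
optimum-intel and transformers APIs. The checkpoint path, cache directory, and
prompt are hypothetical placeholders.

    from optimum.intel import OVModelForCausalLM
    from transformers import AutoTokenizer

    hf_path = "/path/to/hf/checkpoint"     # hypothetical local HF checkpoint
    ov_dir = "/path/to/cache/model-ov-7b"  # hypothetical converted-model dir

    # First run: export the PyTorch checkpoint to OpenVINO IR and cache it,
    # as _convert_hf_to_ov() does when no __valid_download meta file exists.
    model = OVModelForCausalLM.from_pretrained(hf_path, export=True)
    model.save_pretrained(ov_dir)

    # Later runs: load the cached IR directly, as _load_model() does.
    model = OVModelForCausalLM.from_pretrained(ov_dir)
    tokenizer = AutoTokenizer.from_pretrained(hf_path)

    inputs = tokenizer("Hello, OpenVINO!", return_tensors="pt")
    output_ids = model.generate(**inputs, max_new_tokens=32)
    print(tokenizer.decode(output_ids[0], skip_special_tokens=True))

Caching the exported IR next to the download directory and marking it with the
__valid_download meta file means the one-time export cost is paid once per model
and revision, not on every load.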