From ee0e2a2749e3b77c3f814bf1a86afbf3955105b5 Mon Sep 17 00:00:00 2001 From: llyycchhee Date: Thu, 3 Jul 2025 06:18:24 +0000 Subject: [PATCH 1/6] fix(mistral): add model file --- xinference/model/llm/llm_family.json | 39 ++++ .../llm/transformers/multimodal/mistral3.py | 170 ++++++++++++++++++ xinference/model/llm/vllm/core.py | 4 + 3 files changed, 213 insertions(+) create mode 100644 xinference/model/llm/transformers/multimodal/mistral3.py diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json index 070bff43c1..81ddcd9742 100644 --- a/xinference/model/llm/llm_family.json +++ b/xinference/model/llm/llm_family.json @@ -10561,5 +10561,44 @@ ], "reasoning_start_tag": "", "reasoning_end_tag": "" + }, + { + "version": 1, + "context_length": 131072, + "model_name": "mistral-small-3.2-instruct", + "model_lang": [ + "en", + "fr", + "de", + "es", + "it", + "pt", + "zh", + "ru", + "ja", + "ko" + ], + "model_ability": [ + "chat", + "vision" + ], + "model_description": "Mistral-Small-3.1 is a 24B parameter model designed for instruction-following tasks, optimized for performance and efficiency. It supports both English and Chinese languages, making it versatile for various applications.", + "model_specs": [ + { + "model_format": "awq", + "model_size_in_billions": 24, + "quantizations": [ + "Int4" + ], + "model_id": "gghfez/Mistral-Small-3.2-24B-Instruct-hf-AWQ" + } + ], + "chat_template": "{%- set today = '' %}\n{%- set default_system_message = \"You are Mistral Small 3, a Large Language Model (LLM) created by Mistral AI, a French startup headquartered in Paris.\\nYour knowledge base was last updated on 2023-10-01. The current date is \" + today + \".\\n\\nWhen you're not sure about some information, you say that you don't have the information and don't make up anything.\\nIf the user's question is not clear, ambiguous, or does not provide enough context for you to accurately answer the question, you do not try to answer it right away and you rather ask the user to clarify their request (e.g. 
\\\"What are some good restaurants around me?\\\" => \\\"Where are you?\\\" or \\\"When is the next flight to Tokyo\\\" => \\\"Where do you travel from?\\\")\" %}\n\n{{- bos_token }}\n\n{%- if messages[0]['role'] == 'system' %}\n {%- if messages[0]['content'] is string %}\n {%- set system_message = messages[0]['content'] %}\n {%- else %}\n {%- set system_message = messages[0]['content'][0]['text'] %}\n {%- endif %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set system_message = default_system_message %}\n {%- set loop_messages = messages %}\n{%- endif %}\n{{- '[SYSTEM_PROMPT]' + system_message + '[/SYSTEM_PROMPT]' }}\n\n{%- for message in loop_messages %}\n {%- if message['role'] == 'user' %}\n {%- if message['content'] is string %}\n {{- '[INST]' + message['content'] + '[/INST]' }}\n {%- else %}\n {{- '[INST]' }}\n {%- for block in message['content'] %}\n {%- if block['type'] == 'text' %}\n {{- block['text'] }}\n {%- elif block['type'] in ['image', 'image_url'] %}\n {{- '[IMG]' }}\n {%- else %}\n {{- raise_exception('Only text and image blocks are supported in message content!') }}\n {%- endif %}\n {%- endfor %}\n {{- '[/INST]' }}\n {%- endif %}\n {%- elif message['role'] == 'system' %}\n {%- if message['content'] is string %}\n {{- '[SYSTEM_PROMPT]' + message['content'] + '[/SYSTEM_PROMPT]' }}\n {%- else %}\n {{- '[SYSTEM_PROMPT]' + message['content'][0]['text'] + '[/SYSTEM_PROMPT]' }}\n {%- endif %}\n {%- elif message['role'] == 'assistant' %}\n {%- if message['content'] is string %}\n {{- message['content'] + eos_token }}\n {%- else %}\n {{- message['content'][0]['text'] + eos_token }}\n {%- endif %}\n {%- else %}\n {{- raise_exception('Only user, system and assistant roles are supported!') }}\n {%- endif %}\n{%- endfor %}", + "stop_token_ids": [ + 2 + ], + "stop": [ + "" + ] } ] diff --git a/xinference/model/llm/transformers/multimodal/mistral3.py b/xinference/model/llm/transformers/multimodal/mistral3.py new file mode 100644 index 0000000000..a2e90e61da --- /dev/null +++ b/xinference/model/llm/transformers/multimodal/mistral3.py @@ -0,0 +1,170 @@ +# Copyright 2022-2025 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import importlib.util +import logging +from typing import Any, Dict, Iterator, List, Optional, Tuple +from io import BytesIO +from PIL import Image + +from .....core.model import register_batching_multimodal_models +from .....core.scheduler import InferenceRequest +from .....device_utils import is_npu_available +from .....model.utils import select_device +from .....types import PytorchModelConfig +from ...llm_family import LLMFamilyV1, LLMSpecV1, register_transformer +from ..core import register_non_default_model +from .core import PytorchMultiModalModel + +logger = logging.getLogger(__name__) + + +@register_batching_multimodal_models("mistral-small-3.2-instruct") +@register_transformer +@register_non_default_model("mistral-small-3.2-instruct") +class MistralAWQMultimodalModel(PytorchMultiModalModel): + def _sanitize_model_config( + self, pytorch_model_config: Optional[PytorchModelConfig] + ) -> PytorchModelConfig: + pytorch_model_config = super()._sanitize_model_config(pytorch_model_config) + assert pytorch_model_config is not None + return pytorch_model_config + + @classmethod + def match_json( + cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str + ) -> bool: + if model_spec.model_format not in ["pytorch", "gptq", "awq"]: + return False + llm_family = model_family.model_family or model_family.model_name + if "mistral-small-3.2-instruct" in llm_family.lower(): + return True + return False + + def decide_device(self): + device = self._pytorch_model_config.get("device", "auto") + device = select_device(device) + self._device = device + + def load_processor(self): + from transformers import AutoProcessor + from transformers import AutoTokenizer + + + min_pixels = self._pytorch_model_config.get("min_pixels") + max_pixels = self._pytorch_model_config.get("max_pixels") + self._processor = AutoProcessor.from_pretrained( + self.model_path, + trust_remote_code=True, + min_pixels=min_pixels, + max_pixels=max_pixels, + ) + # 加载 tokenizer + self._tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True) + + def load_multimodal_model(self): + from transformers import Mistral3ForConditionalGeneration + + kwargs = self.apply_bnb_quantization() + device = "auto" if self._device == "cuda" else self._device + + self._model = Mistral3ForConditionalGeneration.from_pretrained( + self.model_path, + device_map="cuda", + torch_dtype="bfloat16", + **kwargs + ).eval() + + def build_inputs_from_messages( + self, messages: List[Dict], generate_config: Dict + ): + messages = self._transform_messages(messages) + inputs = self._processor.apply_chat_template( + messages, + add_generation_prompt=True, + tokenize=True, + return_dict=True, + return_tensors="pt", + ).to(self._device) + return inputs + + def build_generate_kwargs(self, generate_config: Dict) -> Dict[str, Any]: + return dict( + max_new_tokens=generate_config.get("max_tokens", 512), + temperature=generate_config.get("temperature", 1), + ) + + def build_streaming_iter( + self, + messages: List[Dict], + generate_config: Dict, + ) -> Tuple[Iterator, int]: + from threading import Thread + from transformers import TextIteratorStreamer + + inputs = self.build_inputs_from_messages(messages, generate_config) + configs = self.build_generate_kwargs(generate_config) + + tokenizer = self._tokenizer + streamer = TextIteratorStreamer( + tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True + ) + + gen_kwargs = {"streamer": streamer, **inputs, **configs} + t = Thread(target=self._model.generate, 
kwargs=gen_kwargs) + t.start() + return streamer, len(inputs.input_ids[0]) + + def prepare_sanitize_generate_config(self, req: InferenceRequest): + from transformers import GenerationConfig + + gen_config = GenerationConfig.from_pretrained(self.model_path).to_dict() + raw_config = req.inference_kwargs.get("raw_params", {}) + gen_config.update(raw_config) + return gen_config + + def _get_full_prompt(self, messages: List[Dict], tools, generate_config: dict): + return messages + + def build_prefill_kwargs(self, prompts: List, req_list: List[InferenceRequest]): + import torch + + texts = [] + for p in prompts: + if hasattr(self._tokenizer, "apply_chat_template"): + text = self._tokenizer.apply_chat_template( + [ + {"role": "user", "content": p.get("content", "")} + ], tokenize=False, add_generation_prompt=True + ) + else: + text = p.get("content", "") + texts.append(text) + + inputs = self._tokenizer( + texts, + return_tensors="pt", + padding=True, + padding_side="left" + ).to(self._device) + + for r, ids, attn_mask in zip(req_list, inputs.input_ids, inputs.attention_mask): + r.prompt_tokens = ids.tolist() + real_len = torch.sum(attn_mask).item() + r.padding_len = attn_mask.numel() - real_len + r.extra_kwargs["attention_mask_seq_len"] = real_len + + batch_size, seq_len = inputs.input_ids.shape + position_ids = self.build_prefill_position_ids(batch_size, seq_len, req_list) + return {**inputs, "position_ids": position_ids} \ No newline at end of file diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py index 38bdb1f839..a4c12a5a05 100644 --- a/xinference/model/llm/vllm/core.py +++ b/xinference/model/llm/vllm/core.py @@ -248,6 +248,10 @@ class VLLMGenerateConfig(TypedDict, total=False): VLLM_SUPPORTED_CHAT_MODELS.append("gemma-3-1b-it") VLLM_SUPPORTED_VISION_MODEL_LIST.append("gemma-3-it") +if VLLM_INSTALLED and vllm.__version__ >= "0.8.1": + VLLM_SUPPORTED_CHAT_MODELS.append("mistral-small-3.2-instruct") + VLLM_SUPPORTED_VISION_MODEL_LIST.append("mistral-small-3.2-instruct") + if VLLM_INSTALLED and vllm.__version__ >= "0.8.4": VLLM_SUPPORTED_CHAT_MODELS.append("glm4-0414") From bc98cf0c27fc5151c33c265a1ebd060df6c9e449 Mon Sep 17 00:00:00 2001 From: arthur Date: Mon, 21 Jul 2025 09:01:51 +0000 Subject: [PATCH 2/6] feat(model): mistral3.2 support --- xinference/model/llm/llm_family.json | 22 ++- .../llm/transformers/multimodal/mistral3.py | 172 ++++++++++-------- 2 files changed, 118 insertions(+), 76 deletions(-) diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json index 81ddcd9742..d280b71d3d 100644 --- a/xinference/model/llm/llm_family.json +++ b/xinference/model/llm/llm_family.json @@ -10582,18 +10582,34 @@ "chat", "vision" ], - "model_description": "Mistral-Small-3.1 is a 24B parameter model designed for instruction-following tasks, optimized for performance and efficiency. It supports both English and Chinese languages, making it versatile for various applications.", + "model_description": "Mistral-Small-3.2 is a 24B parameter model designed for instruction-following tasks, optimized for performance and efficiency. 
It supports both English and Chinese languages, making it versatile for various applications.", "model_specs": [ - { + { "model_format": "awq", "model_size_in_billions": 24, "quantizations": [ "Int4" ], "model_id": "gghfez/Mistral-Small-3.2-24B-Instruct-hf-AWQ" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 24, + "quantizations": [ + "none" + ], + "model_id": "unsloth/Mistral-Small-3.2-24B-Instruct-2506-bnb-4bit" + }, + { + "model_format": "fp8", + "model_size_in_billions": 24, + "quantizations": [ + "fp8" + ], + "model_id": "RedHatAI/Mistral-Small-3.2-24B-Instruct-2506-FP8" } ], - "chat_template": "{%- set today = '' %}\n{%- set default_system_message = \"You are Mistral Small 3, a Large Language Model (LLM) created by Mistral AI, a French startup headquartered in Paris.\\nYour knowledge base was last updated on 2023-10-01. The current date is \" + today + \".\\n\\nWhen you're not sure about some information, you say that you don't have the information and don't make up anything.\\nIf the user's question is not clear, ambiguous, or does not provide enough context for you to accurately answer the question, you do not try to answer it right away and you rather ask the user to clarify their request (e.g. \\\"What are some good restaurants around me?\\\" => \\\"Where are you?\\\" or \\\"When is the next flight to Tokyo\\\" => \\\"Where do you travel from?\\\")\" %}\n\n{{- bos_token }}\n\n{%- if messages[0]['role'] == 'system' %}\n {%- if messages[0]['content'] is string %}\n {%- set system_message = messages[0]['content'] %}\n {%- else %}\n {%- set system_message = messages[0]['content'][0]['text'] %}\n {%- endif %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set system_message = default_system_message %}\n {%- set loop_messages = messages %}\n{%- endif %}\n{{- '[SYSTEM_PROMPT]' + system_message + '[/SYSTEM_PROMPT]' }}\n\n{%- for message in loop_messages %}\n {%- if message['role'] == 'user' %}\n {%- if message['content'] is string %}\n {{- '[INST]' + message['content'] + '[/INST]' }}\n {%- else %}\n {{- '[INST]' }}\n {%- for block in message['content'] %}\n {%- if block['type'] == 'text' %}\n {{- block['text'] }}\n {%- elif block['type'] in ['image', 'image_url'] %}\n {{- '[IMG]' }}\n {%- else %}\n {{- raise_exception('Only text and image blocks are supported in message content!') }}\n {%- endif %}\n {%- endfor %}\n {{- '[/INST]' }}\n {%- endif %}\n {%- elif message['role'] == 'system' %}\n {%- if message['content'] is string %}\n {{- '[SYSTEM_PROMPT]' + message['content'] + '[/SYSTEM_PROMPT]' }}\n {%- else %}\n {{- '[SYSTEM_PROMPT]' + message['content'][0]['text'] + '[/SYSTEM_PROMPT]' }}\n {%- endif %}\n {%- elif message['role'] == 'assistant' %}\n {%- if message['content'] is string %}\n {{- message['content'] + eos_token }}\n {%- else %}\n {{- message['content'][0]['text'] + eos_token }}\n {%- endif %}\n {%- else %}\n {{- raise_exception('Only user, system and assistant roles are supported!') }}\n {%- endif %}\n{%- endfor %}", + "chat_template": "{%- set today = strftime_now('%Y-%m-%d') %}\n{%- set default_system_message = \"You are Mistral Small 3, a Large Language Model (LLM) created by Mistral AI, a French startup headquartered in Paris.\\nYour knowledge base was last updated on 2023-10-01. 
The current date is \" + today + \".\\n\\nWhen you're not sure about some information, you say that you don't have the information and don't make up anything.\\nIf the user's question is not clear, ambiguous, or does not provide enough context for you to accurately answer the question, you do not try to answer it right away and you rather ask the user to clarify their request (e.g. \\\"What are some good restaurants around me?\\\" => \\\"Where are you?\\\" or \\\"When is the next flight to Tokyo\\\" => \\\"Where do you travel from?\\\")\" %}\n\n{{- bos_token }}\n\n{%- if messages[0]['role'] == 'system' %}\n {%- if messages[0]['content'] is string %}\n {%- set system_message = messages[0]['content'] %}\n {%- else %}\n {%- set system_message = messages[0]['content'][0]['text'] %}\n {%- endif %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set system_message = default_system_message %}\n {%- set loop_messages = messages %}\n{%- endif %}\n{{- '[SYSTEM_PROMPT]' + system_message + '[/SYSTEM_PROMPT]' }}\n\n{%- for message in loop_messages %}\n {%- if message['role'] == 'user' %}\n {%- if message['content'] is string %}\n {{- '[INST]' + message['content'] + '[/INST]' }}\n {%- else %}\n {{- '[INST]' }}\n {%- for block in message['content'] %}\n {%- if block['type'] == 'text' %}\n {{- block['text'] }}\n {%- elif block['type'] in ['image', 'image_url'] %}\n {{- '[IMG]' }}\n {%- else %}\n {{- raise_exception('Only text and image blocks are supported in message content!') }}\n {%- endif %}\n {%- endfor %}\n {{- '[/INST]' }}\n {%- endif %}\n {%- elif message['role'] == 'system' %}\n {%- if message['content'] is string %}\n {{- '[SYSTEM_PROMPT]' + message['content'] + '[/SYSTEM_PROMPT]' }}\n {%- else %}\n {{- '[SYSTEM_PROMPT]' + message['content'][0]['text'] + '[/SYSTEM_PROMPT]' }}\n {%- endif %}\n {%- elif message['role'] == 'assistant' %}\n {%- if message['content'] is string %}\n {{- message['content'] + eos_token }}\n {%- else %}\n {{- message['content'][0]['text'] + eos_token }}\n {%- endif %}\n {%- else %}\n {{- raise_exception('Only user, system and assistant roles are supported!') }}\n {%- endif %}\n{%- endfor %}", "stop_token_ids": [ 2 ], diff --git a/xinference/model/llm/transformers/multimodal/mistral3.py b/xinference/model/llm/transformers/multimodal/mistral3.py index a2e90e61da..22c25328e2 100644 --- a/xinference/model/llm/transformers/multimodal/mistral3.py +++ b/xinference/model/llm/transformers/multimodal/mistral3.py @@ -11,29 +11,24 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- -import importlib.util import logging +from concurrent.futures import ThreadPoolExecutor from typing import Any, Dict, Iterator, List, Optional, Tuple -from io import BytesIO -from PIL import Image -from .....core.model import register_batching_multimodal_models -from .....core.scheduler import InferenceRequest -from .....device_utils import is_npu_available +import torch + from .....model.utils import select_device from .....types import PytorchModelConfig +from ...utils import _decode_image from ...llm_family import LLMFamilyV1, LLMSpecV1, register_transformer from ..core import register_non_default_model from .core import PytorchMultiModalModel logger = logging.getLogger(__name__) - -@register_batching_multimodal_models("mistral-small-3.2-instruct") @register_transformer @register_non_default_model("mistral-small-3.2-instruct") -class MistralAWQMultimodalModel(PytorchMultiModalModel): +class MistralMultimodalModel(PytorchMultiModalModel): def _sanitize_model_config( self, pytorch_model_config: Optional[PytorchModelConfig] ) -> PytorchModelConfig: @@ -53,15 +48,12 @@ def match_json( return False def decide_device(self): - device = self._pytorch_model_config.get("device", "auto") - device = select_device(device) - self._device = device + device = self._pytorch_model_config.get("device", "cuda") + self._device = select_device(device) def load_processor(self): from transformers import AutoProcessor from transformers import AutoTokenizer - - min_pixels = self._pytorch_model_config.get("min_pixels") max_pixels = self._pytorch_model_config.get("max_pixels") self._processor = AutoProcessor.from_pretrained( @@ -70,39 +62,113 @@ def load_processor(self): min_pixels=min_pixels, max_pixels=max_pixels, ) - # 加载 tokenizer - self._tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True) + self._tokenizer = AutoTokenizer.from_pretrained( + self.model_path, trust_remote_code=True, use_fast=False + ) + def load_multimodal_model(self): + from transformers import BitsAndBytesConfig from transformers import Mistral3ForConditionalGeneration - - kwargs = self.apply_bnb_quantization() - device = "auto" if self._device == "cuda" else self._device + kwargs = {"device_map": self._device} + kwargs = self.apply_bnb_quantization(kwargs) + if '4bit' in self.model_path: + quantization_config = BitsAndBytesConfig(load_in_4bit=True) + kwargs["quantization_config"] = quantization_config + elif '8bit' in self.model_path: + quantization_config = BitsAndBytesConfig(load_in_8bit=True) + kwargs["quantization_config"] = quantization_config + self._model = Mistral3ForConditionalGeneration.from_pretrained( self.model_path, - device_map="cuda", - torch_dtype="bfloat16", + low_cpu_mem_usage=True, + trust_remote_code=True, + torch_dtype=torch.float16, **kwargs ).eval() + # if self._device == 'cuda': + # self._model.cuda() + + @staticmethod + def _get_processed_msgs(messages: List[Dict]) -> List[Dict]: + res = [] + texts = [] + images = [] + for message in messages: + role = message["role"] + content = message["content"] + if isinstance(content, str): + res.append({"role": role, "content": [{"type": "text", "text": content}]}) + texts.append(content) + else: + texts = [] + image_urls = [] + for c in content: + c_type = c.get("type") + if c_type == "text": + texts.append(c["text"]) + else: + assert ( + c_type == "image_url" + ), "Please follow the image input of the OpenAI API." 
+                        image_urls.append(c["image_url"]["url"])
+                if len(image_urls) > 1:
+                    raise RuntimeError("Only one image per message is supported")
+                image_futures = []
+                with ThreadPoolExecutor() as executor:
+                    for image_url in image_urls:
+                        fut = executor.submit(_decode_image, image_url)
+                        image_futures.append(fut)
+                images = [fut.result() for fut in image_futures]
+                assert len(images) <= 1
+                text = " ".join(texts)
+                if images:
+                    res.append({"role": role, "content": [{"type": "image", "image": images[0]}, {"type": "text", "text": text}] })
+                    texts.append(text)
+                    images.append(images[0])
+                else:
+                    texts.append(text)
+                    res.append({"role": role, "content": [{"type": "text", "text": text}]})
+        return res,texts,images
+
+    @staticmethod
+    def flatten_content(msg):
+        if isinstance(msg["content"], list):
+            parts = []
+            for part in msg["content"]:
+                if part["type"] == "image":
+                    parts.append("")  # or another placeholder
+                elif part["type"] == "text":
+                    parts.append(part["text"])
+            msg["content"] = "".join(parts)
+        return msg
+
     def build_inputs_from_messages(
         self, messages: List[Dict], generate_config: Dict
     ):
-        messages = self._transform_messages(messages)
-        inputs = self._processor.apply_chat_template(
-            messages,
+        rst, text, images = self._get_processed_msgs(messages)
+        flattened_messages = [self.flatten_content(m.copy()) for m in rst]
+        inputs = self._tokenizer.apply_chat_template(
+            conversation=flattened_messages,
+            # text=text,
+            images=images,
             add_generation_prompt=True,
             tokenize=True,
-            return_dict=True,
             return_tensors="pt",
-        ).to(self._device)
+            return_dict=True,
+        )
+        inputs = inputs.to(self._device)
         return inputs
 
     def build_generate_kwargs(self, generate_config: Dict) -> Dict[str, Any]:
         return dict(
-            max_new_tokens=generate_config.get("max_tokens", 512),
+            max_new_tokens=generate_config.get("max_tokens", 1000),
             temperature=generate_config.get("temperature", 1),
+            eos_token_id=generate_config.get("eos_token_id", 2),
+            do_sample=generate_config.get("do_sample", True),
+            bos_token_id=generate_config.get("bos_token_id", 1),
         )
 
     def build_streaming_iter(
@@ -118,53 +184,13 @@ def build_streaming_iter(
         generate_config: Dict,
     ) -> Tuple[Iterator, int]:
         from threading import Thread
         from transformers import TextIteratorStreamer
 
         inputs = self.build_inputs_from_messages(messages, generate_config)
         configs = self.build_generate_kwargs(generate_config)
 
         tokenizer = self._tokenizer
         streamer = TextIteratorStreamer(
-            tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True
+            tokenizer,
+            timeout=60.0,
+            skip_prompt=True,
+            skip_special_tokens=True
         )
 
         gen_kwargs = {"streamer": streamer, **inputs, **configs}
         t = Thread(target=self._model.generate, kwargs=gen_kwargs)
         t.start()
-        return streamer, len(inputs.input_ids[0])
-
-    def prepare_sanitize_generate_config(self, req: InferenceRequest):
-        from transformers import GenerationConfig
-
-        gen_config = GenerationConfig.from_pretrained(self.model_path).to_dict()
-        raw_config = req.inference_kwargs.get("raw_params", {})
-        gen_config.update(raw_config)
-        return gen_config
-
-    def _get_full_prompt(self, messages: List[Dict], tools, generate_config: dict):
-        return messages
-
-    def build_prefill_kwargs(self, prompts: List, req_list: List[InferenceRequest]):
-        import torch
-
-        texts = []
-        for p in prompts:
-            if hasattr(self._tokenizer, "apply_chat_template"):
-                text = self._tokenizer.apply_chat_template(
-                    [
-                        {"role": "user", "content": p.get("content", "")}
-                    ], tokenize=False, add_generation_prompt=True
-                )
-            else:
-                text = p.get("content", "")
-            texts.append(text)
-
-        inputs = self._tokenizer(
-            texts,
-            return_tensors="pt",
-            padding=True,
-            padding_side="left"
-        ).to(self._device)
-
-        for r, ids, attn_mask in zip(req_list, inputs.input_ids, inputs.attention_mask):
-            r.prompt_tokens = ids.tolist()
-            real_len = 
torch.sum(attn_mask).item() - r.padding_len = attn_mask.numel() - real_len - r.extra_kwargs["attention_mask_seq_len"] = real_len - - batch_size, seq_len = inputs.input_ids.shape - position_ids = self.build_prefill_position_ids(batch_size, seq_len, req_list) - return {**inputs, "position_ids": position_ids} \ No newline at end of file + return streamer, len(inputs["input_ids"][0]) From f3c71c7acf6e3e7d8723efc783c3006f429eb00b Mon Sep 17 00:00:00 2001 From: lychee Date: Mon, 21 Jul 2025 09:45:33 +0000 Subject: [PATCH 3/6] feat(model): modify json --- xinference/model/llm/llm_family.json | 43 +++++++++++++++++++ .../llm/transformers/multimodal/mistral3.py | 4 +- 2 files changed, 45 insertions(+), 2 deletions(-) diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json index 792d2cc648..20ddcbe197 100644 --- a/xinference/model/llm/llm_family.json +++ b/xinference/model/llm/llm_family.json @@ -18249,5 +18249,48 @@ ], "reasoning_start_tag": "", "reasoning_end_tag": "" + }, + { + "version": 2, + "context_length": 131072, + "model_name": "mistral-small-3.2-instruct", + "model_lang": [ + "en", + "fr", + "de", + "es", + "it", + "pt", + "zh", + "ru", + "ja", + "ko" + ], + "model_ability": [ + "chat", + "vision" + ], + "model_description": "Mistral-Small-3.2 is a 24B parameter model designed for instruction-following tasks, optimized for performance and efficiency. It supports both English and Chinese languages, making it versatile for various applications.", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": 24, + "model_src": { + "huggingface": { + "quantizations": [ + "none" + ], + "model_id": "unsloth/Mistral-Small-3.2-24B-Instruct-2506-bnb-4bit" + } + } + } + ], + "chat_template": "{%- set today = strftime_now('%Y-%m-%d') %}\n{%- set default_system_message = \"You are Mistral Small 3, a Large Language Model (LLM) created by Mistral AI, a French startup headquartered in Paris.\\nYour knowledge base was last updated on 2023-10-01. The current date is \" + today + \".\\n\\nWhen you're not sure about some information, you say that you don't have the information and don't make up anything.\\nIf the user's question is not clear, ambiguous, or does not provide enough context for you to accurately answer the question, you do not try to answer it right away and you rather ask the user to clarify their request (e.g. 
\\\"What are some good restaurants around me?\\\" => \\\"Where are you?\\\" or \\\"When is the next flight to Tokyo\\\" => \\\"Where do you travel from?\\\")\" %}\n\n{{- bos_token }}\n\n{%- if messages[0]['role'] == 'system' %}\n {%- if messages[0]['content'] is string %}\n {%- set system_message = messages[0]['content'] %}\n {%- else %}\n {%- set system_message = messages[0]['content'][0]['text'] %}\n {%- endif %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set system_message = default_system_message %}\n {%- set loop_messages = messages %}\n{%- endif %}\n{{- '[SYSTEM_PROMPT]' + system_message + '[/SYSTEM_PROMPT]' }}\n\n{%- for message in loop_messages %}\n {%- if message['role'] == 'user' %}\n {%- if message['content'] is string %}\n {{- '[INST]' + message['content'] + '[/INST]' }}\n {%- else %}\n {{- '[INST]' }}\n {%- for block in message['content'] %}\n {%- if block['type'] == 'text' %}\n {{- block['text'] }}\n {%- elif block['type'] in ['image', 'image_url'] %}\n {{- '[IMG]' }}\n {%- else %}\n {{- raise_exception('Only text and image blocks are supported in message content!') }}\n {%- endif %}\n {%- endfor %}\n {{- '[/INST]' }}\n {%- endif %}\n {%- elif message['role'] == 'system' %}\n {%- if message['content'] is string %}\n {{- '[SYSTEM_PROMPT]' + message['content'] + '[/SYSTEM_PROMPT]' }}\n {%- else %}\n {{- '[SYSTEM_PROMPT]' + message['content'][0]['text'] + '[/SYSTEM_PROMPT]' }}\n {%- endif %}\n {%- elif message['role'] == 'assistant' %}\n {%- if message['content'] is string %}\n {{- message['content'] + eos_token }}\n {%- else %}\n {{- message['content'][0]['text'] + eos_token }}\n {%- endif %}\n {%- else %}\n {{- raise_exception('Only user, system and assistant roles are supported!') }}\n {%- endif %}\n{%- endfor %}", + "stop_token_ids": [ + 2 + ], + "stop": [ + "" + ] } ] diff --git a/xinference/model/llm/transformers/multimodal/mistral3.py b/xinference/model/llm/transformers/multimodal/mistral3.py index 22c25328e2..e1cd1d2ddb 100644 --- a/xinference/model/llm/transformers/multimodal/mistral3.py +++ b/xinference/model/llm/transformers/multimodal/mistral3.py @@ -20,7 +20,7 @@ from .....model.utils import select_device from .....types import PytorchModelConfig from ...utils import _decode_image -from ...llm_family import LLMFamilyV1, LLMSpecV1, register_transformer +from ...llm_family import LLMFamilyV2, LLMSpecV1, register_transformer from ..core import register_non_default_model from .core import PytorchMultiModalModel @@ -38,7 +38,7 @@ def _sanitize_model_config( @classmethod def match_json( - cls, model_family: "LLMFamilyV1", model_spec: "LLMSpecV1", quantization: str + cls, model_family: "LLMFamilyV2", model_spec: "LLMSpecV1", quantization: str ) -> bool: if model_spec.model_format not in ["pytorch", "gptq", "awq"]: return False From 211fbccad26bbaadb203f2899dcbeec4841c570c Mon Sep 17 00:00:00 2001 From: lychee Date: Mon, 21 Jul 2025 09:56:36 +0000 Subject: [PATCH 4/6] style(mistral3): modify style --- .../llm/transformers/multimodal/mistral3.py | 59 +++++++++++-------- 1 file changed, 34 insertions(+), 25 deletions(-) diff --git a/xinference/model/llm/transformers/multimodal/mistral3.py b/xinference/model/llm/transformers/multimodal/mistral3.py index e1cd1d2ddb..4ad0fa2f31 100644 --- a/xinference/model/llm/transformers/multimodal/mistral3.py +++ b/xinference/model/llm/transformers/multimodal/mistral3.py @@ -19,13 +19,14 @@ from .....model.utils import select_device from .....types import PytorchModelConfig -from ...utils import _decode_image from 
...llm_family import LLMFamilyV2, LLMSpecV1, register_transformer +from ...utils import _decode_image from ..core import register_non_default_model from .core import PytorchMultiModalModel logger = logging.getLogger(__name__) + @register_transformer @register_non_default_model("mistral-small-3.2-instruct") class MistralMultimodalModel(PytorchMultiModalModel): @@ -52,8 +53,8 @@ def decide_device(self): self._device = select_device(device) def load_processor(self): - from transformers import AutoProcessor - from transformers import AutoTokenizer + from transformers import AutoProcessor, AutoTokenizer + min_pixels = self._pytorch_model_config.get("min_pixels") max_pixels = self._pytorch_model_config.get("max_pixels") self._processor = AutoProcessor.from_pretrained( @@ -65,23 +66,22 @@ def load_processor(self): self._tokenizer = AutoTokenizer.from_pretrained( self.model_path, trust_remote_code=True, use_fast=False ) - def load_multimodal_model(self): - from transformers import BitsAndBytesConfig - from transformers import Mistral3ForConditionalGeneration + from transformers import BitsAndBytesConfig, Mistral3ForConditionalGeneration + kwargs = {"device_map": self._device} kwargs = self.apply_bnb_quantization(kwargs) - - if '4bit' in self.model_path: + + if "4bit" in self.model_path: quantization_config = BitsAndBytesConfig(load_in_4bit=True) kwargs["quantization_config"] = quantization_config - elif '8bit' in self.model_path: + elif "8bit" in self.model_path: quantization_config = BitsAndBytesConfig(load_in_8bit=True) kwargs["quantization_config"] = quantization_config self._model = Mistral3ForConditionalGeneration.from_pretrained( - self.model_path, + self.model_path, low_cpu_mem_usage=True, trust_remote_code=True, torch_dtype=torch.float16, @@ -90,9 +90,10 @@ def load_multimodal_model(self): # if self._device == 'cuda': # self._model.cuda() - @staticmethod - def _get_processed_msgs(messages: List[Dict]) -> List[Dict]: + def _get_processed_msgs( + messages: List[Dict], + ) -> tuple[List[Dict], List[str], List[Any]]: res = [] texts = [] images = [] @@ -100,7 +101,9 @@ def _get_processed_msgs(messages: List[Dict]) -> List[Dict]: role = message["role"] content = message["content"] if isinstance(content, str): - res.append({"role": role, "content": [{"type": "text", "text": content}]}) + res.append( + {"role": role, "content": [{"type": "text", "text": content}]} + ) texts.append(content) else: texts = [] @@ -125,14 +128,24 @@ def _get_processed_msgs(messages: List[Dict]) -> List[Dict]: assert len(images) <= 1 text = " ".join(texts) if images: - res.append({"role": role, "content": [{"type": "image", "image": images[0]}, {"type": "text", "text": text}] }) + res.append( + { + "role": role, + "content": [ + {"type": "image", "image": images[0]}, + {"type": "text", "text": text}, + ], + } + ) texts.append(text) images.append(images[0]) else: texts.append(text) - res.append({"role": role, "content": [{"type": "text", "text": text}]}) - return res,texts,images - + res.append( + {"role": role, "content": [{"type": "text", "text": text}]} + ) + return res, texts, images + @staticmethod def flatten_content(msg): if isinstance(msg["content"], list): @@ -144,10 +157,8 @@ def flatten_content(msg): parts.append(part["text"]) msg["content"] = "".join(parts) return msg - - def build_inputs_from_messages( - self, messages: List[Dict], generate_config: Dict - ): + + def build_inputs_from_messages(self, messages: List[Dict], generate_config: Dict): rst, text, images = self._get_processed_msgs(messages) 
flattened_messages = [self.flatten_content(m.copy()) for m in rst] inputs = self._tokenizer.apply_chat_template( @@ -177,6 +188,7 @@ def build_streaming_iter( generate_config: Dict, ) -> Tuple[Iterator, int]: from threading import Thread + from transformers import TextIteratorStreamer inputs = self.build_inputs_from_messages(messages, generate_config) @@ -184,10 +196,7 @@ def build_streaming_iter( tokenizer = self._tokenizer streamer = TextIteratorStreamer( - tokenizer, - timeout=60.0, - skip_prompt=True, - skip_special_tokens=True + tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True ) gen_kwargs = {"streamer": streamer, **inputs, **configs} From 68080c3e6196b84fcccb744f8c6365f9735030f2 Mon Sep 17 00:00:00 2001 From: lychee Date: Mon, 21 Jul 2025 14:33:18 +0000 Subject: [PATCH 5/6] fix(glm-4.1v): resume json --- xinference/model/llm/llm_family.json | 94 ++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json index 20ddcbe197..7777938677 100644 --- a/xinference/model/llm/llm_family.json +++ b/xinference/model/llm/llm_family.json @@ -18250,6 +18250,100 @@ "reasoning_start_tag": "", "reasoning_end_tag": "" }, + { + "version": 2, + "context_length": 65536, + "model_name": "glm-4.1v-thinking", + "model_lang": [ + "en", + "zh" + ], + "model_ability": [ + "chat", + "vision", + "reasoning" + ], + "model_description": "GLM-4.1V-9B-Thinking, designed to explore the upper limits of reasoning in vision-language models.", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": 9, + "model_src": { + "huggingface": { + "quantizations": [ + "none" + ], + "model_id": "THUDM/GLM-4.1V-9B-Thinking", + "model_revision": "b627c82cd8fc9175ff2b82b33fb439eba260055f" + }, + "modelscope": { + "quantizations": [ + "none" + ], + "model_id": "ZhipuAI/GLM-4.1V-9B-Thinking", + "model_revision": "master" + } + } + }, + { + "model_format": "awq", + "model_size_in_billions": 9, + "model_src": { + "huggingface": { + "quantizations": [ + "Int4" + ], + "model_id": "dengcao/GLM-4.1V-9B-Thinking-AWQ" + }, + "modelscope": { + "quantizations": [ + "Int4" + ], + "model_id": "dengcao/GLM-4.1V-9B-Thinking-AWQ", + "model_revision": "master" + } + } + }, + { + "model_format": "gptq", + "model_size_in_billions": 9, + "model_src": { + "huggingface": { + "quantizations": [ + "Int4-Int8Mix" + ], + "model_id": "dengcao/GLM-4.1V-9B-Thinking-GPTQ-Int4-Int8Mix" + }, + "modelscope": { + "quantizations": [ + "Int4-Int8Mix" + ], + "model_id": "dengcao/GLM-4.1V-9B-Thinking-GPTQ-Int4-Int8Mix", + "model_revision": "master" + } + } + } + ], + "chat_template": "[gMASK] {%- for msg in messages %} {%- if msg.role == 'system' %} <|system|> {{ msg.content }} {%- elif msg.role == 'user' %} <|user|>{{ '\n' }} {%- if msg.content is string %} {{ msg.content }} {%- else %} {%- for item in msg.content %} {%- if item.type == 'video' or 'video' in item %} <|begin_of_video|><|video|><|end_of_video|> {%- elif item.type == 'image' or 'image' in item %} <|begin_of_image|><|image|><|end_of_image|> {%- elif item.type == 'text' %} {{ item.text }} {%- endif %} {%- endfor %} {%- endif %} {%- elif msg.role == 'assistant' %} {%- if msg.metadata %} <|assistant|>{{ msg.metadata }} {{ msg.content }} {%- else %} <|assistant|> {{ msg.content }} {%- endif %} {%- endif %} {%- endfor %} {% if add_generation_prompt %}<|assistant|> {% endif %}", + "stop_token_ids": [ + 151329, + 151336, + 151338 + ], + "stop": [ + "<|endoftext|>", + "<|user|>", + 
"<|observation|>" + ], + "reasoning_start_tag": "", + "reasoning_end_tag": "", + "virtualenv": { + "packages": [ + "transformers>=4.53.2", + "#system_numpy#" + ] + } + }, { "version": 2, "context_length": 131072, From 1cfdf3abac0d2fbb1aeb74d7794e15c4f0cab7c8 Mon Sep 17 00:00:00 2001 From: lychee Date: Tue, 22 Jul 2025 04:46:26 +0000 Subject: [PATCH 6/6] fix(glm-4.1v): resume json --- xinference/model/llm/llm_family.json | 327 +++++++++++++++++++++++++++ 1 file changed, 327 insertions(+) diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json index 7777938677..07bce685f0 100644 --- a/xinference/model/llm/llm_family.json +++ b/xinference/model/llm/llm_family.json @@ -18250,6 +18250,333 @@ "reasoning_start_tag": "", "reasoning_end_tag": "" }, + { + "version": 2, + "context_length": 131072, + "model_name": "Ernie4.5", + "model_lang": [ + "en", + "zh" + ], + "model_ability": [ + "chat" + ], + "model_description": "ERNIE 4.5, a new family of large-scale multimodal models comprising 10 distinct variants.", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": "0_3", + "model_src": { + "huggingface": { + "quantizations": [ + "none" + ], + "model_id": "baidu/ERNIE-4.5-0.3B-PT" + }, + "modelscope": { + "quantizations": [ + "none" + ], + "model_id": "PaddlePaddle/ERNIE-4.5-0.3B-PT" + } + } + }, + { + "model_format": "ggufv2", + "model_size_in_billions": "0_3", + "model_src": { + "huggingface": { + "quantizations": [ + "Q2_K", + "Q2_K_L", + "Q3_K_M", + "Q3_K_S", + "Q4_0", + "Q4_1", + "Q4_K_M", + "Q4_K_S", + "Q5_K_M", + "Q5_K_S", + "Q6_K", + "Q8_0", + "F16" + ], + "model_id": "unsloth/ERNIE-4.5-0.3B-PT-GGUF", + "model_file_name_template": "ERNIE-4.5-0.3B-PT-{quantization}.gguf" + }, + "modelscope": { + "quantizations": [ + "Q2_K", + "Q2_K_L", + "Q3_K_M", + "Q3_K_S", + "Q4_0", + "Q4_1", + "Q4_K_M", + "Q4_K_S", + "Q5_K_M", + "Q5_K_S", + "Q6_K", + "Q8_0", + "F16" + ], + "model_id": "unsloth/ERNIE-4.5-0.3B-PT-GGUF", + "model_file_name_template": "ERNIE-4.5-0.3B-PT-{quantization}.gguf" + } + } + }, + { + "model_format": "mlx", + "model_size_in_billions": "0_3", + "model_src": { + "huggingface": { + "quantizations": [ + "4bit", + "bf16" + ], + "model_id": "mlx-community/ERNIE-4.5-0.3B-PT-{quantization}" + }, + "modelscope": { + "quantizations": [ + "4bit", + "bf16" + ], + "model_id": "mlx-community/ERNIE-4.5-0.3B-PT-{quantization}" + } + } + }, + { + "model_format": "pytorch", + "model_size_in_billions": 21, + "activated_size_in_billions": 3, + "model_src": { + "huggingface": { + "quantizations": [ + "none" + ], + "model_id": "baidu/ERNIE-4.5-21B-A3B-Base-PT" + }, + "modelscope": { + "quantizations": [ + "none" + ], + "model_id": "PaddlePaddle/ERNIE-4.5-21B-A3B-Base-PT" + } + } + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 21, + "activated_size_in_billions": 3, + "model_src": { + "huggingface": { + "quantizations": [ + "Q2_K", + "Q2_K_L", + "Q3_K_M", + "Q3_K_S", + "Q4_0", + "Q4_1", + "Q4_K_M", + "Q4_K_S", + "Q5_K_M", + "Q5_K_S", + "Q6_K", + "Q8_0", + "BF16" + ], + "model_id": "unsloth/ERNIE-4.5-21B-A3B-PT-GGUF", + "model_file_name_template": "ERNIE-4.5-21B-A3B-PT-{quantization}.gguf" + }, + "modelscope": { + "quantizations": [ + "Q2_K", + "Q2_K_L", + "Q3_K_M", + "Q3_K_S", + "Q4_0", + "Q4_1", + "Q4_K_M", + "Q4_K_S", + "Q5_K_M", + "Q5_K_S", + "Q6_K", + "Q8_0", + "BF16" + ], + "model_id": "unsloth/ERNIE-4.5-21B-A3B-PT-GGUF", + "model_file_name_template": "ERNIE-4.5-21B-A3B-PT-{quantization}.gguf" + } + } + }, + { + "model_format": 
"mlx", + "model_size_in_billions": 21, + "activated_size_in_billions": 3, + "model_src": { + "huggingface": { + "quantizations": [ + "4bit", + "5bit", + "6bit", + "8bit", + "bf16" + ], + "model_id": "mlx-community/ERNIE-4.5-21B-A3B-PT-{quantization}" + }, + "modelscope": { + "quantizations": [ + "4bit", + "5bit", + "6bit", + "8bit", + "bf16" + ], + "model_id": "mlx-community/ERNIE-4.5-21B-A3B-PT-{quantization}" + } + } + }, + { + "model_format": "pytorch", + "model_size_in_billions": 300, + "activated_size_in_billions": 47, + "model_src": { + "huggingface": { + "quantizations": [ + "none" + ], + "model_id": "baidu/ERNIE-4.5-300B-A47B-PT" + }, + "modelscope": { + "quantizations": [ + "none" + ], + "model_id": "PaddlePaddle/ERNIE-4.5-300B-A47B-PT" + } + } + }, + { + "model_format": "ggufv2", + "model_size_in_billions": 300, + "activated_size_in_billions": 47, + "model_src": { + "huggingface": { + "quantizations": [ + "Q2_K", + "Q4_K_M", + "Q6_K", + "Q8_0" + ], + "quantization_parts": { + "Q2_K": [ + "00001-of-00003", + "00002-of-00003", + "00003-of-00003" + ], + "Q4_K_M": [ + "00001-of-00004", + "00002-of-00004", + "00003-of-00004", + "00004-of-00004" + ], + "Q6_K": [ + "00001-of-00005", + "00002-of-00005", + "00003-of-00005", + "00004-of-00005", + "00005-of-00005" + ], + "Q8_0": [ + "00001-of-00007", + "00002-of-00007", + "00003-of-00007", + "00004-of-00007", + "00005-of-00007", + "00006-of-00007", + "00007-of-00007" + ] + }, + "model_id": "unsloth/ERNIE-4.5-300B-A47B-PT-GGUF", + "model_file_name_template": "ERNIE-4.5-0.3B-PT-{quantization}.gguf", + "model_file_name_split_template": "{quantization}/ERNIE-4.5-300B-A47B-PT-{quantization}-{part}.gguf" + }, + "modelscope": { + "quantizations": [ + "Q2_K", + "Q4_K_M", + "Q6_K", + "Q8_0" + ], + "quantization_parts": { + "Q2_K": [ + "00001-of-00003", + "00002-of-00003", + "00003-of-00003" + ], + "Q4_K_M": [ + "00001-of-00004", + "00002-of-00004", + "00003-of-00004", + "00004-of-00004" + ], + "Q6_K": [ + "00001-of-00005", + "00002-of-00005", + "00003-of-00005", + "00004-of-00005", + "00005-of-00005" + ], + "Q8_0": [ + "00001-of-00007", + "00002-of-00007", + "00003-of-00007", + "00004-of-00007", + "00005-of-00007", + "00006-of-00007", + "00007-of-00007" + ] + }, + "model_id": "unsloth/ERNIE-4.5-300B-A47B-PT-GGUF", + "model_file_name_template": "ERNIE-4.5-0.3B-PT-{quantization}.gguf", + "model_file_name_split_template": "{quantization}/ERNIE-4.5-300B-A47B-PT-{quantization}-{part}.gguf" + } + } + }, + { + "model_format": "mlx", + "model_size_in_billions": 300, + "activated_size_in_billions": 47, + "model_src": { + "huggingface": { + "quantizations": [ + "4bit" + ], + "model_id": "mlx-community/ERNIE-4.5-300B-47B-PT-{quantization}" + }, + "modelscope": { + "quantizations": [ + "4bit" + ], + "model_id": "mlx-community/ERNIE-4.5-300B-47B-PT-{quantization}" + } + } + } + ], + "chat_template": "{%- if not add_generation_prompt is defined -%}\n {%- set add_generation_prompt = true -%}\n{%- endif -%}\n{%- if not cls_token is defined -%}\n {%- set cls_token = \"<|begin_of_sentence|>\" -%}\n{%- endif -%}\n{%- if not sep_token is defined -%}\n {%- set sep_token = \"<|end_of_sentence|>\" -%}\n{%- endif -%}\n{{- cls_token -}}\n{%- for message in messages -%}\n {%- if message[\"role\"] == \"user\" -%}\n {{- \"User: \" + message[\"content\"] + \"\n\" -}}\n {%- elif message[\"role\"] == \"assistant\" -%}\n {{- \"Assistant: \" + message[\"content\"] + sep_token -}}\n {%- elif message[\"role\"] == \"system\" -%}\n {{- message[\"content\"] + \"\n\" -}}\n {%- endif 
-%}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n {{- \"Assistant: \" -}}\n{%- endif -%}", + "stop_token_ids": [ + 2 + ], + "stop": [ + "" + ], + "virtualenv": { + "packages": [ + "transformers", + "mlx-lm>=0.25.2 ; sys_platform=='darwin'", + "#system_numpy#" + ] + } + }, { "version": 2, "context_length": 65536,