-
Notifications
You must be signed in to change notification settings - Fork 751
Closed as not planned
Description
Feature request / 功能建议
Please add support for the XiaomiMiMo series, including:
Motivation / 动机
MiMo-VL-7B-RL outperforms Qwen2.5-VL-Instruct-32B in several use cases (OCR, document understanding, etc.).
Your contribution / 您的贡献
I registered MiMo-VL-7B-RL as a new model in Xinference and ran it with vLLM 0.8.5.post1. It works.
However, in Xinference the Gradio page doesn't work, and the <think> tag is not parsed.
Here is the sample config:
{
"version": 1,
"context_length": 128000,
"model_name": "MiMo-VL-7B-RL",
"model_lang": [
"en",
"zh"
],
"model_ability": [
"generate",
"chat",
"vision",
"reasoning"
],
"model_description": "MiMo-VL-7B-SFT and MiMo-VL-7B-RL are two powerful vision-language models delivering state-of-the-art performance in both general visual understanding and multimodal reasoning.",
"model_family": "qwen2.5-vl-instruct",
"model_specs": [
{
"model_format": "pytorch",
"model_size_in_billions": 7,
"quantizations": [
"none"
],
"model_id": "XiaomiMiMo/MiMo-VL-7B-RL",
"model_hub": "modelscope",
"model_uri": null,
"model_revision": null,
"activated_size_in_billions": null
}
],
"chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are MiMo, an AI assistant developed by Xiaomi.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
"stop_token_ids": [
151645,
151643
],
"stop": [
"<|im_end|>",
"<|endoftext|>"
],
"reasoning_start_tag": "<think>",
"reasoning_end_tag": "</think>",
"virtualenv": null,
"is_builtin": false
}