diff --git "a/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" "b/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" index 4f3a18d421..449755f220 100644 --- "a/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" +++ "b/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" @@ -929,6 +929,10 @@ |[AI-ModelScope/llava-next-72b](https://modelscope.cn/models/AI-ModelScope/llava-next-72b)|llava_next_qwen|llava_next_qwen|transformers>=4.42, av|✘|vision|[lmms-lab/llava-next-72b](https://huggingface.co/lmms-lab/llava-next-72b)| |[AI-ModelScope/llava-next-110b](https://modelscope.cn/models/AI-ModelScope/llava-next-110b)|llava_next_qwen|llava_next_qwen|transformers>=4.42, av|✘|vision|[lmms-lab/llava-next-110b](https://huggingface.co/lmms-lab/llava-next-110b)| |[AI-ModelScope/llama3-llava-next-8b](https://modelscope.cn/models/AI-ModelScope/llama3-llava-next-8b)|llama3_llava_next|llama3_llava_next|transformers>=4.42, av|✘|vision|[lmms-lab/llama3-llava-next-8b](https://huggingface.co/lmms-lab/llama3-llava-next-8b)| +|[lmms-lab/LLaVA-OneVision-1.5-4B-Instruct](https://modelscope.cn/models/lmms-lab/LLaVA-OneVision-1.5-4B-Instruct)|llava_onevision1_5|llava_onevision1_5|transformers>=4.53, qwen_vl_utils>=0.0.6, decord|✘|vision, video|[lmms-lab/LLaVA-OneVision-1.5-4B-Instruct](https://huggingface.co/lmms-lab/LLaVA-OneVision-1.5-4B-Instruct)| +|[lmms-lab/LLaVA-OneVision-1.5-8B-Instruct](https://modelscope.cn/models/lmms-lab/LLaVA-OneVision-1.5-8B-Instruct)|llava_onevision1_5|llava_onevision1_5|transformers>=4.53, qwen_vl_utils>=0.0.6, decord|✘|vision, video|[lmms-lab/LLaVA-OneVision-1.5-8B-Instruct](https://huggingface.co/lmms-lab/LLaVA-OneVision-1.5-8B-Instruct)| +|[lmms-lab/LLaVA-OneVision-1.5-4B-Base](https://modelscope.cn/models/lmms-lab/LLaVA-OneVision-1.5-4B-Base)|llava_onevision1_5|llava_onevision1_5|transformers>=4.53, qwen_vl_utils>=0.0.6, decord|✘|vision, video|[lmms-lab/LLaVA-OneVision-1.5-4B-Base](https://huggingface.co/lmms-lab/LLaVA-OneVision-1.5-4B-Base)| +|[lmms-lab/LLaVA-OneVision-1.5-8B-Base](https://modelscope.cn/models/lmms-lab/LLaVA-OneVision-1.5-8B-Base)|llava_onevision1_5|llava_onevision1_5|transformers>=4.53, qwen_vl_utils>=0.0.6, decord|✘|vision, video|[lmms-lab/LLaVA-OneVision-1.5-8B-Base](https://huggingface.co/lmms-lab/LLaVA-OneVision-1.5-8B-Base)| |[deepseek-ai/deepseek-vl-1.3b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-1.3b-chat)|deepseek_vl|deepseek_vl|-|✘|vision|[deepseek-ai/deepseek-vl-1.3b-chat](https://huggingface.co/deepseek-ai/deepseek-vl-1.3b-chat)| |[deepseek-ai/deepseek-vl-7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-7b-chat)|deepseek_vl|deepseek_vl|-|✘|vision|[deepseek-ai/deepseek-vl-7b-chat](https://huggingface.co/deepseek-ai/deepseek-vl-7b-chat)| |[deepseek-ai/deepseek-vl2-tiny](https://modelscope.cn/models/deepseek-ai/deepseek-vl2-tiny)|deepseek_vl2|deepseek_vl2|transformers<4.42|✘|vision|[deepseek-ai/deepseek-vl2-tiny](https://huggingface.co/deepseek-ai/deepseek-vl2-tiny)| diff --git a/docs/source_en/Instruction/Supported-models-and-datasets.md b/docs/source_en/Instruction/Supported-models-and-datasets.md index de7ed3c20e..ced2275ba5 100644 --- 
--- a/docs/source_en/Instruction/Supported-models-and-datasets.md
+++ b/docs/source_en/Instruction/Supported-models-and-datasets.md
@@ -911,6 +911,10 @@ The table below introduces the models integrated with ms-swift:
 |[llava-hf/llava-v1.6-vicuna-13b-hf](https://modelscope.cn/models/llava-hf/llava-v1.6-vicuna-13b-hf)|llava1_6_vicuna_hf|llava1_6_vicuna_hf|transformers>=4.39|✘|vision|[llava-hf/llava-v1.6-vicuna-13b-hf](https://huggingface.co/llava-hf/llava-v1.6-vicuna-13b-hf)|
 |[llava-hf/llava-v1.6-34b-hf](https://modelscope.cn/models/llava-hf/llava-v1.6-34b-hf)|llava1_6_yi_hf|llava1_6_yi_hf|transformers>=4.39|✘|vision|[llava-hf/llava-v1.6-34b-hf](https://huggingface.co/llava-hf/llava-v1.6-34b-hf)|
 |[llava-hf/llama3-llava-next-8b-hf](https://modelscope.cn/models/llava-hf/llama3-llava-next-8b-hf)|llama3_llava_next_hf|llama3_llava_next_hf|transformers>=4.39|✘|vision|[llava-hf/llama3-llava-next-8b-hf](https://huggingface.co/llava-hf/llama3-llava-next-8b-hf)|
+|[lmms-lab/LLaVA-OneVision-1.5-4B-Instruct](https://modelscope.cn/models/lmms-lab/LLaVA-OneVision-1.5-4B-Instruct)|llava_onevision1_5|llava_onevision1_5|transformers>=4.53, qwen_vl_utils>=0.0.6, decord|✘|vision, video|[lmms-lab/LLaVA-OneVision-1.5-4B-Instruct](https://huggingface.co/lmms-lab/LLaVA-OneVision-1.5-4B-Instruct)|
+|[lmms-lab/LLaVA-OneVision-1.5-8B-Instruct](https://modelscope.cn/models/lmms-lab/LLaVA-OneVision-1.5-8B-Instruct)|llava_onevision1_5|llava_onevision1_5|transformers>=4.53, qwen_vl_utils>=0.0.6, decord|✘|vision, video|[lmms-lab/LLaVA-OneVision-1.5-8B-Instruct](https://huggingface.co/lmms-lab/LLaVA-OneVision-1.5-8B-Instruct)|
+|[lmms-lab/LLaVA-OneVision-1.5-4B-Base](https://modelscope.cn/models/lmms-lab/LLaVA-OneVision-1.5-4B-Base)|llava_onevision1_5|llava_onevision1_5|transformers>=4.53, qwen_vl_utils>=0.0.6, decord|✘|vision, video|[lmms-lab/LLaVA-OneVision-1.5-4B-Base](https://huggingface.co/lmms-lab/LLaVA-OneVision-1.5-4B-Base)|
+|[lmms-lab/LLaVA-OneVision-1.5-8B-Base](https://modelscope.cn/models/lmms-lab/LLaVA-OneVision-1.5-8B-Base)|llava_onevision1_5|llava_onevision1_5|transformers>=4.53, qwen_vl_utils>=0.0.6, decord|✘|vision, video|[lmms-lab/LLaVA-OneVision-1.5-8B-Base](https://huggingface.co/lmms-lab/LLaVA-OneVision-1.5-8B-Base)|
 |[llava-hf/llava-next-72b-hf](https://modelscope.cn/models/llava-hf/llava-next-72b-hf)|llava_next_qwen_hf|llava_next_qwen_hf|transformers>=4.39|✘|vision|[llava-hf/llava-next-72b-hf](https://huggingface.co/llava-hf/llava-next-72b-hf)|
 |[llava-hf/llava-next-110b-hf](https://modelscope.cn/models/llava-hf/llava-next-110b-hf)|llava_next_qwen_hf|llava_next_qwen_hf|transformers>=4.39|✘|vision|[llava-hf/llava-next-110b-hf](https://huggingface.co/llava-hf/llava-next-110b-hf)|
 |[llava-hf/LLaVA-NeXT-Video-7B-DPO-hf](https://modelscope.cn/models/llava-hf/LLaVA-NeXT-Video-7B-DPO-hf)|llava_next_video_hf|llava_next_video_hf|transformers>=4.42, av|✘|video|[llava-hf/LLaVA-NeXT-Video-7B-DPO-hf](https://huggingface.co/llava-hf/LLaVA-NeXT-Video-7B-DPO-hf)|
diff --git a/swift/llm/model/constant.py b/swift/llm/model/constant.py
index 725fd5a304..cc4e9ee565 100644
--- a/swift/llm/model/constant.py
+++ b/swift/llm/model/constant.py
@@ -222,6 +222,7 @@ class MLLMModelType:
     llava1_6_yi = 'llava1_6_yi'
     llava_next_qwen = 'llava_next_qwen'
     llama3_llava_next = 'llama3_llava_next'
+    llava_onevision1_5 = 'llava_onevision1_5'
 
     deepseek_vl = 'deepseek_vl'
     deepseek_vl2 = 'deepseek_vl2'
diff --git a/swift/llm/model/model/llava.py b/swift/llm/model/model/llava.py
index 2b698e8897..daf7c68dae 100644
--- a/swift/llm/model/model/llava.py
+++ b/swift/llm/model/model/llava.py
@@ -5,6 +5,7 @@
 from typing import Any, Dict
 
 from transformers import AutoConfig
+from transformers.dynamic_module_utils import get_class_from_dynamic_module
 
 from swift.llm import TemplateType
 from ..constant import MLLMModelType
@@ -389,3 +390,33 @@ def _new_forward(*args, **kwargs):
         requires=['transformers>=4.42', 'av'],
         tags=['vision'],
         model_arch=None))
+
+
+def get_model_tokenizer_llava_onevision1_5(model_dir, *args, **kwargs):
+    model_cls = get_class_from_dynamic_module('modeling_llavaonevision1_5.LLaVAOneVision1_5_ForConditionalGeneration',
+                                              model_dir)
+    model_cls._no_split_modules = ['LLaVAOneVision1_5_DecoderLayer', 'RiceBlock']  # keep whole under device_map
+    model, processor = get_model_tokenizer_multimodal(model_dir, *args, **kwargs)
+    if model is not None:  # model is None when load_model=False
+        model.config.vision_start_token_id = 151652  # '<|vision_start|>' id in the Qwen tokenizer
+    return model, processor
+
+
+register_model(
+    ModelMeta(
+        MLLMModelType.llava_onevision1_5,
+        [
+            ModelGroup([
+                Model('lmms-lab/LLaVA-OneVision-1.5-4B-Instruct', 'lmms-lab/LLaVA-OneVision-1.5-4B-Instruct'),
+                Model('lmms-lab/LLaVA-OneVision-1.5-8B-Instruct', 'lmms-lab/LLaVA-OneVision-1.5-8B-Instruct'),
+                Model('lmms-lab/LLaVA-OneVision-1.5-4B-Base', 'lmms-lab/LLaVA-OneVision-1.5-4B-Base'),
+                Model('lmms-lab/LLaVA-OneVision-1.5-8B-Base', 'lmms-lab/LLaVA-OneVision-1.5-8B-Base'),
+            ], ),
+        ],
+        TemplateType.llava_onevision1_5,
+        get_model_tokenizer_llava_onevision1_5,
+        architectures=['LLaVAOneVision1_5_ForConditionalGeneration'],
+        model_arch=ModelArch.llava_onevision1_5,
+        requires=['transformers>=4.53', 'qwen_vl_utils>=0.0.6', 'decord'],
+        tags=['vision', 'video'],
+    ))
diff --git a/swift/llm/model/model_arch.py b/swift/llm/model/model_arch.py
index 4942f4922a..9c6150ef7e 100644
--- a/swift/llm/model/model_arch.py
+++ b/swift/llm/model/model_arch.py
@@ -46,6 +46,7 @@ class MLLMModelArch:
     llava_hf = 'llava_hf'
     llava_hf_legacy = 'llava_hf_legacy'  # transformers<4.52
     llava_next_video_hf = 'llava_next_video_hf'
+    llava_onevision1_5 = 'llava_onevision1_5'
 
     llava_llama = 'llava_llama'
     llava_mistral = 'llava_mistral'
@@ -705,6 +706,14 @@ def register_model_arch(model_arch: ModelKeys, *, exist_ok: bool = False) -> Non
         language_model='model',
     ))
 
+register_model_arch(
+    MultiModelKeys(
+        MLLMModelArch.llava_onevision1_5,
+        language_model='model.language_model',
+        aligner='model.visual.merger',
+        vision_tower='model.visual',
+    ))
+
 
 def get_model_arch(arch_name: Optional[str]) -> Optional[MultiModelKeys]:
     return MODEL_ARCH_MAPPING.get(arch_name)
diff --git a/swift/llm/template/constant.py b/swift/llm/template/constant.py
index 80bfddc170..52560d88ed 100644
--- a/swift/llm/template/constant.py
+++ b/swift/llm/template/constant.py
@@ -152,6 +152,7 @@ class MLLMTemplateType:
     llava1_6_yi = 'llava1_6_yi'
     llava_next_qwen = 'llava_next_qwen'
     llama3_llava_next = 'llama3_llava_next'
+    llava_onevision1_5 = 'llava_onevision1_5'
 
     yi_vl = 'yi_vl'
 
diff --git a/swift/llm/template/template/llava.py b/swift/llm/template/template/llava.py
index 4f8a04255a..22600d9dd5 100644
--- a/swift/llm/template/template/llava.py
+++ b/swift/llm/template/template/llava.py
@@ -6,6 +6,7 @@
 import transformers
 from packaging import version
 
+from swift.utils import get_env_args
 from ..base import Template
 from ..constant import MLLMTemplateType
 from ..register import TemplateMeta, register_template
@@ -307,3 +308,101 @@ def _data_collator(self, batch: List[Dict[str, Any]], *, padding_to: Optional[in
     ))
 
 register_template(QwenTemplateMeta(MLLMTemplateType.llava_next_qwen, template_cls=LLavaTemplate))
+
+
+class LLavaOneVision1_5Template(Template):
+    image_token_id = 151655
+    video_token_id = 151656
+    placeholder_tokens = ['<|image_pad|>', '<|video_pad|>']
+    use_model = True
+    support_padding_free = True
+
+    def init_env_args(self):
+        super().init_env_args()
+        self.bbox_format = get_env_args('QWENVL_BBOX_FORMAT', str, 'legacy')
+
+    def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index: int,
+                    inputs: StdTemplateInputs) -> List[Context]:
+        from qwen_vl_utils import fetch_image, fetch_video
+        assert media_type in {'image', 'video'}
+        if media_type == 'image':
+            inputs.images[index] = fetch_image({'image': inputs.images[index]})
+            if self.mode == 'lmdeploy':
+                return ['<|vision_start|>', [-100], '<|vision_end|>']
+            else:
+                return ['<|vision_start|><|image_pad|><|vision_end|>']
+        else:
+            video = inputs.videos[index]
+            video, video_kwargs = fetch_video({'video': video}, return_video_sample_fps=True)
+            inputs.mm_processor_kwargs.setdefault('fps', []).append(video_kwargs)
+            tokens = ['<|vision_start|><|video_pad|><|vision_end|>']
+            if isinstance(video, torch.Tensor):
+                video = video.to(torch.uint8)
+            inputs.videos[index] = video
+            return tokens
+
+    def replace_ref(self, ref: str, index: int, inputs: StdTemplateInputs) -> List[Context]:
+        if self.bbox_format == 'legacy':
+            return [f'<|object_ref_start|>{ref}<|object_ref_end|>']
+        else:
+            return [ref]
+
+    def replace_bbox(self, bbox: List[int], index: int, inputs: StdTemplateInputs) -> List[Context]:
+        if self.bbox_format == 'legacy':
+            return [f'<|box_start|>{self._get_bbox_str(bbox)}<|box_end|>']
+        else:
+            return [str(bbox)]
+
+    def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
+        encoded = super()._encode(inputs)
+        processor = self.processor
+        input_ids = encoded['input_ids']
+        labels = encoded['labels']
+        loss_scale = encoded.get('loss_scale', None)
+        for media_type in ['images', 'videos']:
+            mm_data = getattr(inputs, media_type)
+            if mm_data:
+                if media_type == 'images':
+                    media_token = self.image_token_id
+                    media_inputs = processor.image_processor(images=mm_data, return_tensors='pt', do_resize=False)
+                    media_grid_thw = media_inputs['image_grid_thw']
+                else:
+                    kwargs = {}
+                    if hasattr(processor, 'video_processor'):
+                        processor_func = processor.video_processor
+                    else:
+                        processor_func = processor.image_processor
+                        kwargs['images'] = None
+                    media_inputs = processor_func(videos=mm_data, return_tensors='pt', do_resize=False, **kwargs)
+                    media_grid_thw = media_inputs['video_grid_thw']
+                    media_token = self.video_token_id
+                idx_list = findall(input_ids, media_token)
+                merge_length = processor.image_processor.merge_size**2
+
+                def _get_new_tokens(i):
+                    token_len = (media_grid_thw[i].prod() // merge_length)
+                    return [media_token] * token_len
+
+                input_ids, labels, loss_scale = self._extend_tokens(input_ids, labels, loss_scale, idx_list,
+                                                                    _get_new_tokens)
+                encoded.update(media_inputs)
+
+        encoded['input_ids'] = input_ids
+        encoded['labels'] = labels
+        encoded['loss_scale'] = loss_scale
+        return encoded
+
+    def _post_encode(self, model, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        if not self.is_training:
+            return inputs
+        input_ids = inputs['input_ids']
+        base_model = self.get_base_model(model)
+        if hasattr(base_model.model, 'embed_tokens'):
+            inputs_embeds = base_model.model.embed_tokens(input_ids)
+        else:
+            inputs_embeds = base_model.model.language_model.embed_tokens(input_ids)
+        inputs_embeds = self._get_inputs_embeds_hf(inputs_embeds, inputs, model.visual, self.processor, model.config)
+        return {'inputs_embeds': inputs_embeds}
+
+
+register_template(QwenTemplateMeta(MLLMTemplateType.llava_onevision1_5, template_cls=LLavaOneVision1_5Template))
diff --git a/tests/test_align/test_template/test_vision.py b/tests/test_align/test_template/test_vision.py
index 9e65c9473b..eb7af35e00 100644
--- a/tests/test_align/test_template/test_vision.py
+++ b/tests/test_align/test_template/test_vision.py
@@ -980,6 +980,17 @@ def test_deepseek_ocr():
                 '创空间 中体验SWIFT web-ui功能了。')
 
 
+def test_llava_onevision1_5():
+    pt_engine = PtEngine('lmms-lab/LLaVA-OneVision-1.5-4B-Instruct')
+    query = 'Describe this image.'
+    messages = [{'role': 'user', 'content': query}]
+    images = ['https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg']
+    response = _infer_model(pt_engine, messages=messages, images=images)
+    pt_engine.default_template.template_backend = 'jinja'
+    response2 = _infer_model(pt_engine, messages=messages, images=images)
+    assert response == response2
+
+
 if __name__ == '__main__':
     from swift.llm import PtEngine, RequestConfig
     from swift.utils import get_logger, seed_everything
@@ -1051,4 +1062,5 @@ def test_deepseek_ocr():
     # test_internvl3_5_hf()
     # test_internvl_gpt_hf()
     # test_sailvl2()
-    test_deepseek_ocr()
+    # test_deepseek_ocr()
+    test_llava_onevision1_5()
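
Beyond `test_llava_onevision1_5`, the newly registered model can also be exercised directly through ms-swift's Python API. The snippet below is a minimal sketch rather than part of this patch: it assumes the checkpoint can be downloaded from ModelScope (or Hugging Face) and that default generation settings are acceptable; `PtEngine`, `RequestConfig`, and `InferRequest` are the existing ms-swift entry points that the added test also relies on.

```python
# Minimal usage sketch (not part of this patch): send one multimodal request
# to the newly registered llava_onevision1_5 model via ms-swift's PtEngine.
from swift.llm import InferRequest, PtEngine, RequestConfig

engine = PtEngine('lmms-lab/LLaVA-OneVision-1.5-4B-Instruct')
request = InferRequest(
    # '<image>' marks where the image is inserted; the URL is the demo image used in the test above.
    messages=[{'role': 'user', 'content': '<image>Describe this image.'}],
    images=['https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg'])
resp = engine.infer([request], RequestConfig(max_tokens=512, temperature=0))[0]
print(resp.choices[0].message.content)
```

The same check should also be possible from the CLI, e.g. `swift infer --model lmms-lab/LLaVA-OneVision-1.5-4B-Instruct`, though the snippet above mirrors what the added test exercises.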