
Commit a2459b8

update get_model_tokenizer_with_flash_attn (#3337)
1 parent 463091c commit a2459b8

File tree: 7 files changed (+88, -20 lines)

docs/source/Customization/自定义数据集.md

Lines changed: 1 addition & 1 deletion

````diff
@@ -121,7 +121,7 @@ label代表两个句子的相似度, loss使用`cosine_similarity`
 {"messages": [{"role": "system", "content": "你是个有用无害的助手"}, {"role": "user", "content": "<image>图片中是什么,<video>视频中是什么"}, {"role": "assistant", "content": "图片中是一个大象,视频中是一只小狗在草地上奔跑"}], "images": ["/xxx/x.jpg"], "videos": ["/xxx/x.mp4"]}
 ```
 
-多模态模型的RLHF和序列分类的数据格式可以参考纯文本大模型的格式。
+多模态模型的RLHF和序列分类的数据格式可以参考纯文本大模型的格式,并在此基础上增加`images`等字段
 
 #### grounding
 
````

docs/source_en/Customization/Custom-dataset.md

Lines changed: 1 addition & 2 deletions

````diff
@@ -123,8 +123,7 @@ Supervised Fine-tuning:
 {"messages": [{"role": "system", "content": "You are a helpful and harmless assistant."}, {"role": "user", "content": "<image>What is in the image, <video>What is in the video?"}, {"role": "assistant", "content": "The image shows an elephant, and the video shows a puppy running on the grass."}], "images": ["/xxx/x.jpg"], "videos": ["/xxx/x.mp4"]}
 ```
 
-The data formats for RLHF and sequence classification in multimodal models can refer to the formats used in pure text large models.
-
+The data format for RLHF and sequence classification of multimodal models can reference the format of pure text large models, with additional fields such as `images` added on top of that.
 
 #### Grounding
 
````
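For illustration, a hedged sketch of what the amended sentence describes. The `rejected_response` and `label` fields below follow the pure-text conventions documented elsewhere in these docs, not this diff, and the paths are placeholders:

```python
# Hypothetical samples (not part of the commit): pure-text RLHF and
# sequence-classification formats with an 'images' field added on top.
# 'rejected_response' and 'label' are assumed from the pure-text examples.
rlhf_sample = {
    'messages': [{'role': 'user', 'content': '<image>What is in the image?'},
                 {'role': 'assistant', 'content': 'The image shows an elephant.'}],
    'rejected_response': 'The image shows a dog.',
    'images': ['/xxx/x.jpg'],  # placeholder path
}
seq_cls_sample = {
    'messages': [{'role': 'user', 'content': '<image>What is in the image?'}],
    'label': 1,
    'images': ['/xxx/x.jpg'],  # placeholder path
}
```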

swift/llm/model/model/qwen.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -650,6 +650,7 @@ def get_model_tokenizer_qwen2_audio(*args, **kwargs):
650650

651651

652652
def get_model_tokenizer_ovis(*args, **kwargs):
653+
kwargs['attn_impl_keys'] = ['llm_attn_implementation']
653654
model, tokenizer = get_model_tokenizer_with_flash_attn(*args, **kwargs)
654655
model.visual_tokenizer.to(model.dtype)
655656
model.vte.to(model.dtype)
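The Ovis config keeps its inner LLM's attention setting under `llm_attn_implementation` rather than the usual `_attn_implementation`, which is what this override accounts for. A minimal sketch of the intended effect, assuming a config object that exposes that key (`PretrainedConfig` stores extra constructor kwargs as attributes):

```python
# Hedged sketch, not part of the commit: exercising the new attn_impl_keys
# routing against a stand-in for an Ovis-style config.
from transformers import PretrainedConfig

from swift.llm.model.utils import AttnImpl  # module path per this commit's diff

config = PretrainedConfig(llm_attn_implementation='sdpa')
AttnImpl.update_attn_impl(config, 'flash_attn', attn_impl_keys=['llm_attn_implementation'])
print(config.llm_attn_implementation)  # expected: 'flash_attention_2'
```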

swift/llm/model/register.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -237,7 +237,7 @@ def get_model_tokenizer_with_flash_attn(model_dir: str,
     model_config = kwargs.get('model_config')
     if model_config is None:
         model_config = AutoConfig.from_pretrained(model_dir, trust_remote_code=True)
-    AttnImpl.update_attn_impl(model_config, kwargs.get('attn_impl'))
+    AttnImpl.update_attn_impl(model_config, kwargs.get('attn_impl'), kwargs.get('attn_impl_keys'))
     kwargs['model_config'] = model_config
     return get_model_tokenizer_from_local(model_dir, model_info, model_kwargs, load_model, **kwargs)
```
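Since `attn_impl_keys` rides along in `**kwargs`, any `get_model_tokenizer_*` integration can now redirect the user's `attn_impl` to a nonstandard config key the same way the Ovis loader above does. A hedged sketch with a hypothetical key name:

```python
# Hypothetical integration (not in the commit): 'my_llm_attn_key' is
# illustrative only. Injecting 'attn_impl_keys' before delegating makes
# update_attn_impl write the resolved backend to that key.
from swift.llm.model.register import get_model_tokenizer_with_flash_attn


def get_model_tokenizer_custom(*args, **kwargs):
    kwargs['attn_impl_keys'] = ['my_llm_attn_key']
    return get_model_tokenizer_with_flash_attn(*args, **kwargs)
```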

swift/llm/model/utils.py

Lines changed: 23 additions & 15 deletions

```diff
@@ -6,10 +6,8 @@
 from typing import Any, Dict, List, Literal, Optional, Tuple, TypeVar, Union
 
 import torch
-import transformers
 from accelerate.utils import find_device
 from modelscope.hub.utils.utils import get_cache_dir
-from packaging import version
 from transformers import PretrainedConfig
 
 from swift.hub import get_hub
@@ -26,25 +24,31 @@ class AttnImpl:
     sdpa = 'sdpa'
     eager = 'eager'
 
+    attn_impl_keys = ['_attn_implementation', 'attn_implementation', 'llm_attn_implementation']
+    use_flash_attn_keys = ['_flash_attn_2_enabled', 'use_flash_attn', '_use_flash_attention_2']
+
     @staticmethod
     def to_use_flash_attn(attn_impl: Optional[str], auto_value: _T = None) -> Union[bool, _T]:
         if attn_impl is None:
             return auto_value
         return attn_impl == AttnImpl.flash_attn
 
     @staticmethod
-    def update_attn_impl(config: PretrainedConfig, attn_impl: Optional[str], auto_value: _T = None) -> None:
-
-        use_flash_attn = AttnImpl.to_use_flash_attn(attn_impl, auto_value)
-        if use_flash_attn is None:
+    def update_attn_impl(config: PretrainedConfig,
+                         attn_impl: Optional[str],
+                         attn_impl_keys: Optional[List[str]] = None) -> None:
+        if attn_impl is None:
             return
-        from swift.llm import HfConfigFactory
-        if version.parse(transformers.__version__) >= version.parse('4.36'):
-            if use_flash_attn:
-                attn_impl = 'flash_attention_2'
-            HfConfigFactory.set_config_attr(config, '_attn_implementation', attn_impl)
-        else:
-            HfConfigFactory.set_config_attr(config, '_flash_attn_2_enabled', use_flash_attn)
+        use_flash_attn = AttnImpl.to_use_flash_attn(attn_impl)
+        if use_flash_attn:
+            attn_impl = 'flash_attention_2'
+        if isinstance(attn_impl_keys, str):
+            attn_impl_keys = [attn_impl_keys]
+        attn_impl_keys = attn_impl_keys or AttnImpl.attn_impl_keys
+        for key in attn_impl_keys:
+            HfConfigFactory.set_config_attr(config, key, attn_impl, ensure_set=False)
+        for key in AttnImpl.use_flash_attn_keys:
+            HfConfigFactory.set_config_attr(config, key, use_flash_attn, ensure_set=False)
 
 
 @dataclass
@@ -109,16 +113,20 @@ def get_config_attr(config: Union[PretrainedConfig, Dict[str, Any]], attr_name:
         return attrs[0][1]
 
     @staticmethod
-    def set_config_attr(config: Union[PretrainedConfig, Dict[str, Any]], attr_name: str, value: Any) -> None:
+    def set_config_attr(config: Union[PretrainedConfig, Dict[str, Any]],
+                        attr_name: str,
+                        value: Any,
+                        ensure_set: bool = True) -> int:
         """Set all the attr_name attributes to value."""
         attrs = HfConfigFactory._get_config_attrs(config, attr_name)
-        if len(attrs) == 0:
+        if ensure_set and len(attrs) == 0:
             attrs.append((config, None))
         for config, _ in attrs:
             if isinstance(config, dict):
                 config[attr_name] = value
             else:
                 setattr(config, attr_name, value)
+        return len(attrs)
 
     @staticmethod
     def set_model_config_attr(model, attr_name: str, value: Any) -> None:
```
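The net behavior: `update_attn_impl` now only rewrites attention keys that a model's config actually defines (via `ensure_set=False`), and `set_config_attr` reports how many matching attributes it touched. A hedged sketch using a plain dict config, assuming `_get_config_attrs` matches dict keys as the `isinstance(config, dict)` branch implies:

```python
# Hedged sketch, not part of the commit.
from swift.llm import HfConfigFactory  # import path shown in the removed line above

config = {'_attn_implementation': 'eager'}

# Existing key: rewritten, returns the number of matches found.
n = HfConfigFactory.set_config_attr(config, '_attn_implementation', 'flash_attention_2')
print(n, config)  # expected: 1 {'_attn_implementation': 'flash_attention_2'}

# Missing key with ensure_set=False: left unset, returns 0.
n = HfConfigFactory.set_config_attr(config, 'use_flash_attn', True, ensure_set=False)
print(n, config)  # expected: 0, config unchanged
```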

tests/models/test_flash_attn.py

Lines changed: 8 additions & 0 deletions

```diff
@@ -0,0 +1,8 @@
+from swift.llm import get_model_tokenizer
+
+if __name__ == '__main__':
+    # model, tokenizer = get_model_tokenizer('Qwen/Qwen2-7B-Instruct', attn_impl='flash_attn')
+    # model, tokenizer = get_model_tokenizer('AIDC-AI/Ovis2-2B', attn_impl='flash_attn')
+    # model, tokenizer = get_model_tokenizer('OpenGVLab/InternVL2-2B', attn_impl='flash_attn')
+    model, tokenizer = get_model_tokenizer('Shanghai_AI_Laboratory/internlm3-8b-instruct', attn_impl='flash_attn')
+    print(model)
```

tests/test_align/test_vllm_vlm.py

Lines changed: 53 additions & 1 deletion

```diff
@@ -31,6 +31,20 @@ def _infer_image(model, use_chat_template: bool = True, max_model_len=8192, syst
     return resp_list[0].choices[0].message.content
 
 
+def _infer_video(model, use_chat_template: bool = True, max_model_len=8192, system=None):
+    engine = VllmEngine(model, max_model_len=max_model_len, limit_mm_per_prompt={'image': 16, 'video': 2})
+    if not use_chat_template:
+        engine.default_template.use_chat_template = False
+    videos = ['https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/baby.mp4']
+    messages = []
+    if system is not None:
+        messages += [{'role': 'system', 'content': system}]
+    messages.append({'role': 'user', 'content': 'describe the video.'})
+    resp_list = engine.infer([InferRequest(messages=messages, videos=videos)],
+                             RequestConfig(temperature=0, max_tokens=64, repetition_penalty=1.))
+    return resp_list[0].choices[0].message.content
+
+
 def test_qwen2_audio():
     response = _infer_audio('Qwen/Qwen2-Audio-7B-Instruct')
     assert response == "The audio is a man speaking in Mandarin saying '今天天气真好呀'."
@@ -68,10 +82,48 @@ def test_internvl2():
                         'and it appears to be looking directly at the camera. The fur is soft and fluffy, with a mix')
 
 
+def test_minicpmv_2_5():
+    response = _infer_image('OpenBMB/MiniCPM-Llama3-V-2_5', max_model_len=4096)
+    assert response == (
+        "The image is a digital painting of a kitten that captures the essence of a young feline's innocence "
+        "and curiosity. The kitten's fur is rendered with a mix of gray, white, and black stripes, "
+        'giving it a realistic and adorable appearance. Its large, expressive eyes are a striking blue, '
+        "which draws the viewer's")
+
+
+def test_minicpmv_2_6():
+    response = _infer_image('OpenBMB/MiniCPM-V-2_6', max_model_len=4096)
+    assert response == (
+        'The image features a close-up of a kitten with striking blue eyes and a mix of '
+        "white and dark fur, possibly gray or black. The kitten's gaze is directed forward, giving it an "
+        "expressive and captivating look. The background is blurred, drawing focus to the kitten's face. "
+        "The overall composition emphasizes the kitten's features")
+
+
+def test_minicpmo_2_6_video():
+    response = _infer_video('OpenBMB/MiniCPM-o-2_6')
+    assert response == ('The video features a young child sitting on a bed, deeply engaged in reading a book. '
+                        'The child, dressed in a light blue sleeveless top and pink pants, is surrounded by a '
+                        'cozy and homely environment. The bed is adorned with a patterned blanket, and a white cloth '
+                        'is casually draped over the side.')
+
+
+def test_qwen2_5_vl_video():
+    response = _infer_video('Qwen/Qwen2.5-VL-3B-Instruct')
+    assert response == ('A baby wearing sunglasses is sitting on a bed and reading a book. '
+                        'The baby is holding the book with both hands and is looking at the pages. '
+                        'The baby is wearing a light blue shirt and pink pants. The baby is sitting '
+                        'on a white blanket. The baby is looking at the book and is smiling. The baby')
+
+
 if __name__ == '__main__':
     from swift.llm import VllmEngine, InferRequest, RequestConfig
     # test_qwen2_vl()
     # test_qwen2_5_vl()
     # test_deepseek_vl_v2()
     # test_internvl2()
-    test_qwen2_audio()
+    # test_qwen2_audio()
+    # test_minicpmv_2_5()
+    # test_minicpmv_2_6()
+    test_minicpmo_2_6_video()
+    # test_qwen2_5_vl_video()
```
