[FIX] update flash-attention for ViT to sync with official repo (#793)

XHPlus · web-flow · commit 3626756c998c · 2025-04-04T00:08:48.000+08:00
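Update flash-attention for ViT to sync with the official repo's [update](https://github.com/Dao-AILab/flash-attention/commit/7ae5f8c#diff-e3790eb114f13873b06146deb854ad12d785a3997ad79b6cf4dd9485419eb632R39): the kernel call inside `flash_attention_v3_fwd` now passes one additional positional `None` argument, and the Dockerfile builds flash-attention `v2.7.4.post1` from its `hopper` directory. This commit also adds `InternVLDeepSeek2TpPartModel`, selected in `base_backend.py` when the InternVL `llm_config` model type is `deepseek_v3`.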
diff --git a/Dockerfile b/Dockerfile
@@ -40,5 +40,8 @@ RUN pip install -r /lightllm/requirements.txt --no-cache-dir --ignore-installed
 
 RUN pip install --no-cache-dir nvidia-nccl-cu12==2.25.1  # for allreduce hang issues in multinode H100
 
+RUN git clone https://github.com/Dao-AILab/flash-attention.git -b v2.7.4.post1
+RUN cd flash-attention/hopper && NVCC_THREADS=128 python setup.py install
+
 COPY . /lightllm
 RUN pip install -e /lightllm --no-cache-dir
diff --git a/lightllm/models/internvl/model.py b/lightllm/models/internvl/model.py
@@ -4,6 +4,7 @@
 from lightllm.models.llama.model import LlamaTpPartModel
 from lightllm.models.phi3.model import Phi3TpPartModel
 from lightllm.models.qwen2.model import Qwen2TpPartModel
+from lightllm.models.deepseek2.model import Deepseek2TpPartModel
 from lightllm.models.qwen_vl.layer_infer.pre_layer_infer import LlamaMultimodalPreLayerInfer
 from lightllm.server.multimodal_params import MultimodalParams, ImageItem
 from lightllm.common.build_utils import repair_config
@@ -26,10 +27,10 @@
 IMG_END_TOKEN = "</img>"
 IMG_TOKEN = "<image>"
 
+
 # Warp of the origal tokenizer
 class InternvlTokenizer:
     def __init__(self, tokenizer, model_cfg, **kwargs):
-
         self.llm_model_type = model_cfg.get("llm_config").get("model_type")
         self.tokenizer = tokenizer
         self.image_length = int(os.environ.get("INTERNVL_IMAGE_LENGTH", 256))
@@ -200,3 +201,27 @@ def _init_config(self):
         if self.finetune_config:
             self.config["vocab_size"] = self.finetune_config.vocab_size
         return
+
+
+class InternVLDeepSeek2TpPartModel(Deepseek2TpPartModel):
+    # InternVL wrapper over Deepseek2TpPartModel; supports DeepSeek-V2, V3 and R1 language models
+    # weight class: loader used for the pre- and post-layer weights of this wrapper
+    pre_and_post_weight_class = InternVLLlamaPreAndPostLayerWeight
+
+    # infer class: multimodal pre-layer inference imported from the qwen_vl package
+    pre_layer_infer_class = LlamaMultimodalPreLayerInfer
+
+    def __init__(self, kvargs):
+        super().__init__(kvargs)
+        return
+
+    def _init_config(self):
+        with open(os.path.join(self.weight_dir_, "config.json"), "r") as json_file:
+            self.config = json.load(json_file)["llm_config"]  # keep only the nested LLM section
+        # unify alias key names via repair_config (behaviour defined in lightllm.common.build_utils)
+        repair_config(self.config, same_names=["num_attention_heads", "n_head"])
+        repair_config(self.config, same_names=["hidden_size", "n_embd", "n_embed"])
+        repair_config(self.config, same_names=["num_hidden_layers", "n_layer"])
+        if self.finetune_config:  # a truthy finetune_config overrides vocab_size
+            self.config["vocab_size"] = self.finetune_config.vocab_size
+        return
diff --git a/lightllm/models/vit/triton_kernel/flashattention_nopad.py b/lightllm/models/vit/triton_kernel/flashattention_nopad.py
@@ -192,6 +192,7 @@ def flash_attention_v3_fwd(
             None,
             None,
             None,
+            None,
             softmax_scale,
             causal=False,
             window_size=(-1, -1),
diff --git a/lightllm/server/router/model_infer/mode_backend/base_backend.py b/lightllm/server/router/model_infer/mode_backend/base_backend.py
@@ -27,7 +27,12 @@
 from lightllm.models.gemma_2b.model import Gemma_2bTpPartModel
 from lightllm.models.phi3.model import Phi3TpPartModel
 from lightllm.models.deepseek2.model import Deepseek2TpPartModel
-from lightllm.models.internvl.model import InternVLLlamaTpPartModel, InternVLPhi3TpPartModel, InternVLQwen2TpPartModel
+from lightllm.models.internvl.model import (
+    InternVLLlamaTpPartModel,
+    InternVLPhi3TpPartModel,
+    InternVLQwen2TpPartModel,
+    InternVLDeepSeek2TpPartModel,
+)
 from lightllm.models.internvl.model import InternVLInternlm2TpPartModel
 from lightllm.models.qwen2_vl.model import Qwen2VLTpPartModel
 from lightllm.models.qwen2_reward.model import Qwen2RewardTpPartModel
@@ -199,6 +204,8 @@ def init_model(self, kvargs):
                     self.model = InternVLLlamaTpPartModel(model_kvargs)
                 elif llm_model_type == "qwen2":
                     self.model = InternVLQwen2TpPartModel(model_kvargs)
+                elif llm_model_type == "deepseek_v3":
+                    self.model = InternVLDeepSeek2TpPartModel(model_kvargs)
                 self.is_multimodal = True
             else:
                 raise Exception(f"can not support {self.model_type} now")