Commit f3f29e6

Merge remote-tracking branch 'origin/main' into pd_master
2 parents e9d8758 + c2dbc8f

7 files changed: +154 -6 lines

.github/workflows/docker-publish.yml
Lines changed: 6 additions & 0 deletions

@@ -33,6 +33,12 @@ jobs:
       - name: Checkout repository
         uses: actions/checkout@v3
 
+      - name: Set up swap space
+        if: runner.os == 'Linux'
+        uses: pierotofy/[email protected]
+        with:
+          swap-size-gb: 10
+
       # clean cache image
       - name: Clean up Docker space
         run: |

lightllm/models/llama/model.py
Lines changed: 49 additions & 0 deletions

@@ -88,6 +88,11 @@ def _init_custom(self):
             and self.config.get("rope_scaling", {}).get("rope_type", "base") == "llama3"
         ):
             self._init_to_get_llama3_rotary()
+        elif (
+            self.config.get("rope_scaling", None) is not None
+            and self.config.get("rope_scaling", {}).get("type", "base") == "mrope"
+        ):
+            self._init_to_get_mrope_rotary()
         else:
             self._init_to_get_rotary()
         return

@@ -332,3 +337,47 @@ def _init_to_get_llama3_rotary(self, default_base=10000):
         self._cos_cached = torch.cos(freqs).to(self.data_type).cuda()
         self._sin_cached = torch.sin(freqs).to(self.data_type).cuda()
         return
+
+    def _init_to_get_mrope_rotary(self, default_base=10000):
+        partial_head_dim = int(self.config.get("partial_rotary_factor", 1) * self.head_dim_)
+        if self.config.get("rope_scaling", {}) is None:
+            rope_scaling_factor = 1.0
+        else:
+            rope_scaling_factor = self.config.get("rope_scaling", {}).get("factor", 1.0)
+
+        base = self.config.get("rope_theta", float(default_base))
+
+        if "max_sequence_length" in self.config:
+            max_seq_len = self.config["max_sequence_length"]
+        else:
+            max_position_embeddings = self.config.get(
+                "max_position_embeddings", 2048 if base <= 10000.0 + 1e-5 else 16384
+            )
+            max_seq_len = max_position_embeddings * rope_scaling_factor
+
+        # NTK
+        try:
+            ntk_alpha = float(os.environ.get("LIGHTLLM_NTK_ALPHA", 1))
+            assert ntk_alpha >= 1
+            if ntk_alpha > 1:
+                logger.info(f"Note: NTK enabled, alpha set to {ntk_alpha}")
+            max_seq_len *= ntk_alpha
+            base = base * (ntk_alpha ** (partial_head_dim / (partial_head_dim - 2)))  # Base change formula
+        except:
+            pass
+
+        inv_freq = 1.0 / (
+            base ** (torch.arange(0, partial_head_dim, 2, device="cpu", dtype=torch.float32) / partial_head_dim)
+        )
+
+        t = (
+            torch.arange(max(max_seq_len + 1024 * 128, self.max_seq_length), device="cpu", dtype=torch.float32)
+            / rope_scaling_factor
+        )
+        freqs = torch.outer(t, inv_freq).unsqueeze(0).expand(3, -1, -1)
+        freqs = torch.cat((freqs, freqs), dim=-1)
+
+        self._cos_cached = torch.cos(freqs).to(self.data_type).cuda()
+        self._sin_cached = torch.sin(freqs).to(self.data_type).cuda()
+
+        return
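For reference, a minimal standalone sketch of what _init_to_get_mrope_rotary precomputes, using hypothetical Qwen2-VL-like values (head_dim 128, rope_theta 1e6, 32K positions, which are assumptions, not values from this diff): the inverse frequencies, the NTK base change that only kicks in when LIGHTLLM_NTK_ALPHA > 1, and a cos/sin table replicated across a leading axis of size 3 so it can later be indexed with per-axis (temporal/height/width) position ids.

import torch

head_dim, base, max_seq_len = 128, 1000000.0, 32768   # hypothetical config values
ntk_alpha = 1.0                                        # LIGHTLLM_NTK_ALPHA default
if ntk_alpha > 1:
    base = base * (ntk_alpha ** (head_dim / (head_dim - 2)))  # NTK base-change formula

inv_freq = 1.0 / (base ** (torch.arange(0, head_dim, 2, dtype=torch.float32) / head_dim))
t = torch.arange(max_seq_len, dtype=torch.float32)
freqs = torch.outer(t, inv_freq).unsqueeze(0).expand(3, -1, -1)  # one copy per t/h/w axis
freqs = torch.cat((freqs, freqs), dim=-1)                        # duplicate for both rotary halves
cos_cached, sin_cached = torch.cos(freqs), torch.sin(freqs)
print(cos_cached.shape)   # torch.Size([3, 32768, 128])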
lightllm/models/qwen2_vl/infer_struct.py
Lines changed: 26 additions & 0 deletions

@@ -0,0 +1,26 @@
+import torch
+import numpy as np
+from lightllm.models.llama.infer_struct import LlamaInferStateInfo
+
+
+class Qwen2VLInferStateInfo(LlamaInferStateInfo):
+    def __init__(self):
+        super().__init__()
+        self.position_cos = None
+        self.position_sin = None
+
+    def init_some_extra_state(self, model, input_ids: torch.Tensor):
+        if self.is_prefill:
+            b_seq_len_numpy = self.b_seq_len.cpu().numpy()
+            self.max_seq_len = b_seq_len_numpy.max()
+            position_ids = torch.from_numpy(
+                np.concatenate([np.arange(0, b_seq_len_numpy[i]) for i in range(len(b_seq_len_numpy))])
+            ).cuda()
+            self.position_sin = model._sin_cached[:, position_ids, :].unsqueeze(1)
+            self.position_cos = model._cos_cached[:, position_ids, :].unsqueeze(1)
+            position_ids = None
+        else:
+            position_ids = self.b_seq_len - 1
+            self.position_sin = model._sin_cached[:, position_ids, :].unsqueeze(1)
+            self.position_cos = model._cos_cached[:, position_ids, :].unsqueeze(1)
+        return
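A small sketch (synthetic lengths and a CPU stand-in for the CUDA cache; names and sizes here are assumptions) of what the prefill branch does: flatten per-request position ids and gather the matching rows from the 3-axis cos/sin cache, yielding tensors of shape (3, 1, total_tokens, head_dim) that later broadcast against the (1, heads, total_tokens, head_dim) queries and keys.

import numpy as np
import torch

b_seq_len = np.array([3, 2])                           # two prefill requests of lengths 3 and 2
position_ids = torch.from_numpy(
    np.concatenate([np.arange(0, n) for n in b_seq_len])
)                                                      # tensor([0, 1, 2, 0, 1])

cos_cached = torch.randn(3, 4096, 128)                 # stand-in for model._cos_cached: (axes, max_len, head_dim)
position_cos = cos_cached[:, position_ids, :].unsqueeze(1)
print(position_cos.shape)                              # torch.Size([3, 1, 5, 128])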
lightllm/models/qwen2_vl/layer_infer/transformer_layer_infer.py
Lines changed: 46 additions & 0 deletions

@@ -0,0 +1,46 @@
+import torch
+import torch.functional as F
+import torch.distributed as dist
+import numpy as np
+
+from lightllm.models.llama.layer_infer.transformer_layer_infer import LlamaTransformerLayerInfer
+from functools import partial
+
+
+def rotate_half(x):
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+
+
+def apply_multimodal_rotary_pos_emb(q, k, cos, sin, mrope_section, unsqueeze_dim=1):
+    mrope_section = mrope_section * 2
+    cos = torch.cat([m[i % 3] for i, m in enumerate(cos.split(mrope_section, dim=-1))], dim=-1).unsqueeze(unsqueeze_dim)
+    sin = torch.cat([m[i % 3] for i, m in enumerate(sin.split(mrope_section, dim=-1))], dim=-1).unsqueeze(unsqueeze_dim)
+
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+
+    return q_embed, k_embed
+
+
+class Qwen2VLTransformerLayerInfer(LlamaTransformerLayerInfer):
+    def __init__(self, layer_num, network_config, mode=[]):
+        super().__init__(layer_num, network_config, mode)
+        self.mrope_section = network_config["rope_scaling"]["mrope_section"]
+
+    def _get_qkv(self, input, cache_kv, infer_state, layer_weight):
+        q = layer_weight.q_proj.mm(input)
+        cache_kv = layer_weight.kv_proj.mm(
+            input, out=cache_kv.view(-1, (self.tp_k_head_num_ + self.tp_v_head_num_) * self.head_dim_)
+        ).view(-1, (self.tp_k_head_num_ + self.tp_v_head_num_), self.head_dim_)
+        seq_len, _ = q.shape
+        q = q.view(1, seq_len, -1, self.head_dim_).transpose(1, 2)
+        k = cache_kv[:, : self.tp_k_head_num_, :].view(1, seq_len, -1, self.head_dim_).transpose(1, 2)
+        new_q, new_k = apply_multimodal_rotary_pos_emb(
+            q, k, infer_state.position_cos, infer_state.position_sin, self.mrope_section
+        )
+        new_q = new_q.transpose(1, 2).reshape(1, seq_len, -1)
+        cache_kv[:, : self.tp_k_head_num_, :] = new_k.squeeze(0).permute(1, 0, 2)
+
+        return new_q, cache_kv
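A usage sketch of apply_multimodal_rotary_pos_emb with synthetic random tensors (the head count, sequence length, and values are assumptions). Shapes mirror _get_qkv and Qwen2VLInferStateInfo: q/k are (1, heads, seq, head_dim) and the cached cos/sin are (3, 1, seq, head_dim); mrope_section = [16, 24, 24] is the split shipped in Qwen2-VL configs and sums to half the head dim, so doubling it inside the function makes the temporal/height/width interleaving cover both halves that rotate_half swaps.

import torch
from lightllm.models.qwen2_vl.layer_infer.transformer_layer_infer import apply_multimodal_rotary_pos_emb

heads, seq, head_dim = 4, 5, 128
mrope_section = [16, 24, 24]                  # temporal / height / width split of the rotary dims

q = torch.randn(1, heads, seq, head_dim)
k = torch.randn(1, heads, seq, head_dim)
cos = torch.randn(3, 1, seq, head_dim)        # per-axis tables, as built by Qwen2VLInferStateInfo
sin = torch.randn(3, 1, seq, head_dim)

q_rot, k_rot = apply_multimodal_rotary_pos_emb(q, k, cos, sin, mrope_section)
print(q_rot.shape, k_rot.shape)               # torch.Size([1, 4, 5, 128]) torch.Size([1, 4, 5, 128])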

lightllm/models/qwen2_vl/model.py
Lines changed: 5 additions & 6 deletions

@@ -12,17 +12,16 @@
 from typing import List, Optional, Union
 from transformers.utils import TensorType, logging
 from lightllm.common.build_utils import repair_config
+from lightllm.models.qwen2_vl.infer_struct import Qwen2VLInferStateInfo
+from lightllm.models.qwen2_vl.layer_infer.transformer_layer_infer import Qwen2VLTransformerLayerInfer
 
-# from lightllm.models.qwen2_vl.vision_process import Qwen2VLImageProcessor
 import torch
 from PIL import Image
 from .vision_process import smart_resize
 from lightllm.models.qwen2.layer_weights import transformer_layer_weight, pre_and_post_layer_weight
 from lightllm.models.qwen2.model import Qwen2TpPartModel
 import os
 
-# from lightllm.models.qwen2_vl.layer_weight.pre_and_post_layer_weight import Qwen2VLPreAndPostLayerWeight
-
 # Warp of the origal tokenizer
 class QWen2VLTokenizer:
     def __init__(self, tokenizer=None, image_processor=None, **kwargs):

@@ -89,10 +88,10 @@ def __getattr__(self, name):
 
 class Qwen2VLTpPartModel(Qwen2TpPartModel):
 
-    # weight class
-    # pre_and_post_weight_class = Qwen2VLPreAndPostLayerWeight
-    # infer class
     pre_layer_infer_class = LlamaMultimodalPreLayerInfer
+    transformer_layer_infer_class = Qwen2VLTransformerLayerInfer
+
+    infer_state_class = Qwen2VLInferStateInfo
 
     def __init__(self, kvargs):
         super().__init__(kvargs)

lightllm/server/httpserver/manager.py
Lines changed: 9 additions & 0 deletions

@@ -25,10 +25,12 @@
 from lightllm.server.core.objs.io_objs import GroupReqObjs
 from fastapi import Request
 from lightllm.server.core.objs.shm_req_manager import ShmReqManager
+from lightllm.server.router.dynamic_prompt.shared_arr import SharedInt
 from lightllm.utils.log_utils import init_logger
 from lightllm.server.metrics.manager import MetricClient
 from lightllm.utils.statics_utils import MovingAverage
 from lightllm.utils.config_utils import get_vocab_size
+from lightllm.utils.envs_utils import get_unique_server_name
 
 logger = init_logger(__name__)
 

@@ -103,6 +105,10 @@ def __init__(
         # for some models, the vocab size read from the tokenizer and from config.json are inconsistent
         self.vocab_size = max(get_vocab_size(args.model_dir), self.tokenizer.vocab_size)
 
+        # Time mark of the latest successful inference (prefill/decode), used to check the health of the system.
+        # If the mark is not updated within a preset window, a probe request is sent to the backend.
+        self.latest_success_infer_time_mark = SharedInt(f"{get_unique_server_name()}_latest_success_infer_time_mark")
+        self.latest_success_infer_time_mark.set_value(int(time.time()))
         return
 
     # connect cache server, calculate md5, alloc resource, return uuid

@@ -483,6 +489,9 @@ async def _wait_to_token_package(
 
                 out_token_counter += 1
 
+                # update the inference time mark
+                self.latest_success_infer_time_mark.set_value(int(time.time()))
+
                 yield sub_req_id, out_str, metadata, finish_status
                 # if a sub-request has finished, update the counter
                 if finish_status.is_finished():
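The hunks above are the writer half of a simple cross-process watchdog: the HTTP server stamps a shared integer after every emitted token, and the health checker (next file) treats the service as live while that stamp is fresh. A condensed sketch of the pattern, with a hypothetical shared-memory name instead of the get_unique_server_name()-derived one:

import time
from lightllm.server.router.dynamic_prompt.shared_arr import SharedInt

mark = SharedInt("demo_latest_success_infer_time_mark")  # hypothetical name for the sketch

# writer side (httpserver manager, after a token is produced)
mark.set_value(int(time.time()))

# reader side (health check)
def has_recent_inference(timeout_s: int = 100) -> bool:
    return time.time() - mark.get_value() < timeout_s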

lightllm/utils/health_check.py
Lines changed: 13 additions & 0 deletions

@@ -1,13 +1,16 @@
 import os
+import time
 import asyncio
 import numpy as np
 from dataclasses import dataclass
 from lightllm.server.core.objs import SamplingParams
 from lightllm.server.multimodal_params import MultimodalParams
 from lightllm.server.httpserver.manager import HttpServerManager
+from lightllm.server.router.dynamic_prompt.shared_arr import SharedInt
 from fastapi import Request
 from lightllm.server.req_id_generator import ReqIDGenerator
 from lightllm.utils.log_utils import init_logger
+from lightllm.utils.envs_utils import get_unique_server_name
 
 logger = init_logger(__name__)
 

@@ -24,6 +27,7 @@ class HealthObj:
     _failure_threshold: int = int(os.getenv("HEALTH_FAILURE_THRESHOLD", 3))
     timeout: int = int(os.getenv("HEALTH_TIMEOUT", 100))
     dynamic_timeout: int = int(os.getenv("HEALTH_TIMEOUT", 100))
+    latest_success_infer_time_mark = SharedInt(f"{get_unique_server_name()}_latest_success_infer_time_mark")
 
     def begin_check(self):
         self._is_health_checking = True

@@ -48,13 +52,22 @@ def is_health(self):
     def is_checking(self):
         return self._is_health_checking
 
+    def has_latest_inference(self):
+        last_timemark = self.latest_success_infer_time_mark.get_value()
+        time_diff = time.time() - last_timemark
+        return time_diff < self.timeout
+
 
 health_obj = HealthObj()
 
 
 async def health_check(args, httpserver_manager: HttpServerManager, request: Request):
     if health_obj.is_checking():
         return health_obj.is_health()
+
+    if health_obj.is_health() and health_obj.has_latest_inference():
+        return health_obj.is_health()
+
     health_obj.begin_check()
     try:
         request_dict = {"inputs": "你好!", "parameters": {"do_sample": True, "temperature": 0.8, "max_new_tokens": 2}}
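The effect of the new fast path: as long as the previous check passed and the shared time mark shows a successful token emission younger than HEALTH_TIMEOUT seconds (default 100), health_check returns immediately instead of spending a real probe generation on every poll; the probe request is only issued once the server has been idle for longer than that window or the last check failed.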
