
Commit 8c3e38d

Author: sangchengmeng (committed)
Merge branch 'main' into tokens_num
2 parents 6f73a16 + d872211, commit 8c3e38d

File tree: 11 files changed (+69, -60 lines)


Dockerfile

Lines changed: 3 additions & 0 deletions
@@ -40,5 +40,8 @@ RUN pip install -r /lightllm/requirements.txt --no-cache-dir --ignore-installed
 
 RUN pip install --no-cache-dir nvidia-nccl-cu12==2.25.1 # for allreduce hang issues in multinode H100
 
+RUN git clone https://github.com/Dao-AILab/flash-attention.git -b v2.7.4.post1
+RUN cd flash-attention/hopper && NVCC_THREADS=128 python setup.py install
+
 COPY . /lightllm
 RUN pip install -e /lightllm --no-cache-dir
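The two new RUN lines build the Hopper (FA3) flash-attention kernels from source for H100-class GPUs. A minimal runtime sanity check could look like the sketch below; the flash_attn_interface module name and the SM90 expectation are assumptions, not facts stated in this diff.

```python
# Sketch only: verify the GPU and the Hopper flash-attention build inside the image.
import torch

assert torch.cuda.is_available(), "CUDA device required"
major, minor = torch.cuda.get_device_capability()
print(f"compute capability: {major}.{minor}")  # the hopper/ build targets SM90 (H100)

try:
    import flash_attn_interface  # assumed module name installed by hopper/setup.py
    print("Hopper flash-attention import OK")
except ImportError as exc:
    print(f"Hopper flash-attention not importable: {exc}")
```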

lightllm/models/internvl/model.py

Lines changed: 26 additions & 1 deletion
@@ -4,6 +4,7 @@
 from lightllm.models.llama.model import LlamaTpPartModel
 from lightllm.models.phi3.model import Phi3TpPartModel
 from lightllm.models.qwen2.model import Qwen2TpPartModel
+from lightllm.models.deepseek2.model import Deepseek2TpPartModel
 from lightllm.models.qwen_vl.layer_infer.pre_layer_infer import LlamaMultimodalPreLayerInfer
 from lightllm.server.multimodal_params import MultimodalParams, ImageItem
 from lightllm.common.build_utils import repair_config
@@ -26,10 +27,10 @@
 IMG_END_TOKEN = "</img>"
 IMG_TOKEN = "<image>"
 
+
 # Warp of the origal tokenizer
 class InternvlTokenizer:
     def __init__(self, tokenizer, model_cfg, **kwargs):
-
         self.llm_model_type = model_cfg.get("llm_config").get("model_type")
         self.tokenizer = tokenizer
         self.image_length = int(os.environ.get("INTERNVL_IMAGE_LENGTH", 256))
@@ -200,3 +201,27 @@ def _init_config(self):
         if self.finetune_config:
             self.config["vocab_size"] = self.finetune_config.vocab_size
         return
+
+
+class InternVLDeepSeek2TpPartModel(Deepseek2TpPartModel):
+    # support Deepseek2,3,R1
+    # weight class
+    pre_and_post_weight_class = InternVLLlamaPreAndPostLayerWeight
+
+    # infer class
+    pre_layer_infer_class = LlamaMultimodalPreLayerInfer
+
+    def __init__(self, kvargs):
+        super().__init__(kvargs)
+        return
+
+    def _init_config(self):
+        with open(os.path.join(self.weight_dir_, "config.json"), "r") as json_file:
+            self.config = json.load(json_file)["llm_config"]
+        # rename keys
+        repair_config(self.config, same_names=["num_attention_heads", "n_head"])
+        repair_config(self.config, same_names=["hidden_size", "n_embd", "n_embed"])
+        repair_config(self.config, same_names=["num_hidden_layers", "n_layer"])
+        if self.finetune_config:
+            self.config["vocab_size"] = self.finetune_config.vocab_size
+        return
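The new InternVLDeepSeek2TpPartModel reads its LLM settings from the "llm_config" section of the checkpoint's config.json and normalizes key names with repair_config. A rough sketch of the expected layout, with illustrative values only:

```python
# Illustrative sketch of the config.json shape consumed by InternVLDeepSeek2TpPartModel._init_config().
example_config = {
    "llm_config": {
        "model_type": "deepseek_v3",   # base_backend.py routes this value to the new class
        "num_attention_heads": 128,    # alias n_head is unified by repair_config
        "hidden_size": 7168,           # aliases: n_embd / n_embed
        "num_hidden_layers": 61,       # alias: n_layer
    },
    # the vision tower and other InternVL sections are omitted here
}
```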

lightllm/models/llama/model.py

Lines changed: 6 additions & 2 deletions
@@ -68,7 +68,10 @@ def _init_custom(self):
         """
         Some model-specific initialization
         """
-        if self.config.get("use_rope_yarn", False):
+        if self.config.get("use_rope_yarn", False) or (
+            self.config.get("rope_scaling", None) is not None
+            and self.config.get("rope_scaling", {}).get("type", "base") == "yarn"
+        ):
             self._init_to_get_yarn_rotary()
         elif self.config.get("use_dynamic_ntk", False) or (
             self.config.get("rope_scaling", None) is not None
@@ -215,7 +218,8 @@ def _init_to_get_yarn_rotary(self):
             scale = 1.0
         else:
             scale = self.config.get("rope_scaling", {}).get("factor", 1.0)
-        original_max_position_embeddings = self.config.get("original_max_position_embeddings", 2048)
+        rope_config = self.config.get("rope_scaling", {})
+        original_max_position_embeddings = rope_config.get("original_max_position_embeddings", 2048)
         extrapolation_factor = 1.0
         attn_factor = 1.0
         beta_fast = 32.0
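With these two hunks, the YaRN rotary path is also taken when the checkpoint declares rope_scaling.type == "yarn", and original_max_position_embeddings is read from the rope_scaling block instead of the top-level config. A config fragment that exercises the new branch might look like this (values are illustrative):

```python
# Illustrative fragment: this rope_scaling block now triggers _init_to_get_yarn_rotary().
yarn_config_fragment = {
    "rope_scaling": {
        "type": "yarn",                             # matched by the new condition in _init_custom()
        "factor": 4.0,                              # read as `scale`
        "original_max_position_embeddings": 4096,   # now read from rope_scaling, not the top level
    }
}
```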

lightllm/models/vit/triton_kernel/flashattention_nopad.py

Lines changed: 1 addition & 2 deletions
@@ -175,7 +175,6 @@ def flash_attention_v3_fwd(
         v,
         None,
         None, # k_new, v_new
-        None, # qv
         o, # out
         None,
         None,
@@ -193,7 +192,7 @@
         None,
         None,
         softmax_scale,
-        causal=False,
+        False, # causal
         window_size=(-1, -1),
         softcap=0.0,
         num_splits=1,

lightllm/server/api_http.py

Lines changed: 0 additions & 51 deletions
@@ -385,57 +385,6 @@ async def tokens(request: Request):
         return create_error_response(HTTPStatus.EXPECTATION_FAILED, f"error: {str(e)}")
 
 
-# for special cases
-@app.get("/tokens_num")
-@app.post("/tokens_num")
-async def tokens_num(request: Request):
-    try:
-        request_dict = await request.json()
-        prompt = request_dict.pop("text")
-        sample_params_dict = request_dict.pop("parameters", {})
-
-        sampling_params = SamplingParams()
-        sampling_params.init(tokenizer=g_objs.httpserver_manager.tokenizer, **sample_params_dict)
-        sampling_params.verify()
-
-        multimodal_params_dict = request_dict.get("multimodal_params", {})
-        images_size = multimodal_params_dict.get("images", [])
-
-        prompt_ids = g_objs.httpserver_manager.tokenizer.encode(prompt, None, add_special_tokens=False)
-        image_tokens = 0
-        img_count = 0
-        max_num = 0
-        if sampling_params.image_max_patch_num >= 0:
-            max_num = sampling_params.image_max_patch_num
-        else:
-            num_images = len(images_size)
-            if num_images == 1:
-                max_num = 12
-            elif num_images > 1 and num_images <= 6:
-                max_num = 6
-            elif num_images > 6:
-                max_num = 0
-        image_token_length = int(os.environ.get("INTERNVL_IMAGE_LENGTH", 256))
-
-        for img_size in images_size:
-            img_count += 1
-            image_tokens += (
-                g_objs.httpserver_manager.tokenizer.get_image_patch_func(
-                    img_size[0], img_size[1], max_num=max_num, use_thumbnail=True
-                )
-                * image_token_length
-            )
-
-        num_tokens = len(prompt_ids) + image_tokens + img_count
-
-        return JSONResponse(
-            {"ntokens": num_tokens},
-            status_code=200,
-        )
-    except Exception as e:
-        return create_error_response(HTTPStatus.EXPECTATION_FAILED, f"error: {str(e)}")
-
-
 @app.get("/metrics")
 async def metrics() -> Response:
     data = await g_objs.metric_client.generate_latest()
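For reference, the removed /tokens_num handler accepted a JSON body with "text", optional "parameters", and a "multimodal_params" block whose "images" entries were (width, height) pairs. A sketch of a request it used to serve, reconstructed from the deleted handler (URL and values are placeholders):

```python
# Sketch of a request to the now-removed /tokens_num endpoint.
import requests

payload = {
    "text": "<image> Describe this picture.",
    "parameters": {"image_max_patch_num": 12},        # mapped onto SamplingParams
    "multimodal_params": {"images": [[1024, 768]]},   # per-image (width, height) pairs
}
resp = requests.post("http://localhost:8000/tokens_num", json=payload)
print(resp.json())  # previously returned {"ntokens": <prompt tokens + image tokens + image count>}
```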

lightllm/server/api_start.py

Lines changed: 9 additions & 0 deletions
@@ -11,6 +11,7 @@
 from .visualserver.manager import start_visual_process
 from lightllm.utils.log_utils import init_logger
 from lightllm.utils.envs_utils import set_env_start_args, set_unique_server_name, get_unique_server_name
+from lightllm.utils.envs_utils import get_lightllm_gunicorn_time_out_seconds
 from .detokenization.manager import start_detokenization_process
 from .router.manager import start_router_process
 from lightllm.utils.process_check import is_process_active
@@ -95,6 +96,10 @@ def normal_or_p_d_start(args):
     if args.use_dynamic_prompt_cache:
         assert args.token_healing_mode is False
 
+    # chunked prefill must be enabled together with dynamic_prompt_cache
+    if not args.disable_chunked_prefill:
+        assert args.use_dynamic_prompt_cache is True
+
     # Some modes cannot yet cooperate with the advanced dynamic scheduling algorithm, TODO.
     if args.diverse_mode:
         assert args.router_token_ratio == 0.0
@@ -246,6 +251,8 @@ def normal_or_p_d_start(args):
         "--error-logfile",
         "-",
         "lightllm.server.api_http:app",
+        "--timeout",
+        f"{get_lightllm_gunicorn_time_out_seconds()}",
     ]
 
     # start the child processes
@@ -303,6 +310,8 @@ def pd_master_start(args):
         "-",
         "--preload",
         "lightllm.server.api_http:app",
+        "--timeout",
+        f"{get_lightllm_gunicorn_time_out_seconds()}",
     ]
 
     http_server_process = subprocess.Popen(command)
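Both gunicorn launch commands now receive an explicit --timeout taken from get_lightllm_gunicorn_time_out_seconds(). The helper itself lives in lightllm/utils/envs_utils.py and is not shown in this diff; a hedged sketch of what such a helper typically looks like, where the environment variable name and default are assumptions:

```python
# Hypothetical sketch only; see lightllm/utils/envs_utils.py for the real implementation.
import os

def get_lightllm_gunicorn_time_out_seconds() -> int:
    # Assumed variable name and default, purely for illustration.
    return int(os.getenv("LIGHTLLM_GUNICORN_TIMEOUT", "180"))
```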

lightllm/server/api_tgi.py

Lines changed: 1 addition & 0 deletions
@@ -184,6 +184,7 @@ async def stream_results() -> AsyncGenerator[bytes, None]:
             ret["details"] = {
                 "generated_tokens": len(final_output),
                 "finish_reason": finish_status.get_finish_reason(),
+                "prompt_tokens": metadata.get("prompt_tokens", 0),
             }
 
             yield ("data:" + json.dumps(ret, ensure_ascii=False) + "\n\n").encode("utf-8")

lightllm/server/httpserver/manager.py

Lines changed: 7 additions & 3 deletions
@@ -288,7 +288,11 @@ async def generate(
         except Exception as e:
             logger.error(f"group_request_id: {group_request_id} has exception {str(e)}")
             # error need to release multimodel resources.
-            await self._release_multimodal_resources(multimodal_params)
+            # Multimodal resources not yet attached to a formal request object must be released
+            # here explicitly; request objects already registered in req_id_to_out_inf are
+            # reclaimed by the unified recycling loop.
+            if group_request_id not in self.req_id_to_out_inf:
+                await self._release_multimodal_resources(multimodal_params)
             await self.abort(group_request_id)
             raise e
         return
@@ -381,7 +385,7 @@ async def transfer_to_next_module_or_node(
             await self.transfer_to_next_module(group_req_objs)
             return
         # Slave nodes in multi-node pure-TP mode must forward requests in the order received; this needs a lock and queueing mechanism.
-        # self.request_order_queue implements a simple queued hand-off so that requests from the master and slave
+        # self.request_order_queue implements a simple queued hand-off so that requests from the master and slave
         # nodes reach each node's router in the same order, which is required for synchronized scheduling.
         if self.is_multinode_tp_slave:
             while True:
@@ -584,7 +588,7 @@ async def handle_loop(self):
         if self.pd_mode.is_P_or_D():
             self.forwarding_queue = AsyncQueue()
             asyncio.create_task(self.pd_handle_loop())
-
+
         # In multi-node TP mode the slave node needs to start a coroutine task to receive
         # the request objects forwarded by the master.

lightllm/server/multimodal_params.py

Lines changed: 4 additions & 0 deletions
@@ -33,6 +33,10 @@ def preload(self):
             img_data = ret.content
         elif self._type == "base64":
            img_data = base64.b64decode(self._data)
+        elif self._type == "image_size":
+            self.image_w = self._data[0]
+            self.image_h = self._data[1]
+            return
         else:
             raise ValueError(f"cannot read image which type is {self._type}!")
 
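The new "image_size" branch lets a client pass only an image's dimensions instead of pixel data; preload() then records image_w / image_h and returns without fetching anything. A sketch of such an entry (the exact request schema around it is assumed):

```python
# Illustrative sketch: an image entry that carries only dimensions, handled by the new branch above.
multimodal_params = {
    "images": [
        {"type": "image_size", "data": [1024, 768]},  # data[0] = width, data[1] = height
    ]
}
```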

lightllm/server/router/model_infer/mode_backend/base_backend.py

Lines changed: 8 additions & 1 deletion
@@ -27,7 +27,12 @@
 from lightllm.models.gemma_2b.model import Gemma_2bTpPartModel
 from lightllm.models.phi3.model import Phi3TpPartModel
 from lightllm.models.deepseek2.model import Deepseek2TpPartModel
-from lightllm.models.internvl.model import InternVLLlamaTpPartModel, InternVLPhi3TpPartModel, InternVLQwen2TpPartModel
+from lightllm.models.internvl.model import (
+    InternVLLlamaTpPartModel,
+    InternVLPhi3TpPartModel,
+    InternVLQwen2TpPartModel,
+    InternVLDeepSeek2TpPartModel,
+)
 from lightllm.models.internvl.model import InternVLInternlm2TpPartModel
 from lightllm.models.qwen2_vl.model import Qwen2VLTpPartModel
 from lightllm.models.qwen2_reward.model import Qwen2RewardTpPartModel
@@ -199,6 +204,8 @@ def init_model(self, kvargs):
                     self.model = InternVLLlamaTpPartModel(model_kvargs)
                 elif llm_model_type == "qwen2":
                     self.model = InternVLQwen2TpPartModel(model_kvargs)
+                elif llm_model_type == "deepseek_v3":
+                    self.model = InternVLDeepSeek2TpPartModel(model_kvargs)
                 self.is_multimodal = True
             else:
                 raise Exception(f"can not support {self.model_type} now")
