
Commit 8c3e38d

Author: sangchengmeng (committed)
Merge branch 'main' into tokens_num
2 parents 6f73a16 + d872211, commit 8c3e38d

File tree: 11 files changed (+69, -60 lines)


Dockerfile

Lines changed: 3 additions & 0 deletions
@@ -40,5 +40,8 @@ RUN pip install -r /lightllm/requirements.txt --no-cache-dir --ignore-installed
 
 RUN pip install --no-cache-dir nvidia-nccl-cu12==2.25.1 # for allreduce hang issues in multinode H100
 
+RUN git clone https://github.com/Dao-AILab/flash-attention.git -b v2.7.4.post1
+RUN cd flash-attention/hopper && NVCC_THREADS=128 python setup.py install
+
 COPY . /lightllm
 RUN pip install -e /lightllm --no-cache-dir
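The two new RUN lines build the Hopper (FA3) flash-attention kernels from source for H100-class GPUs. A minimal runtime sanity check could look like the sketch below; the flash_attn_interface module name and the SM90 expectation are assumptions, not facts stated in this diff.

```python
# Sketch only: verify the GPU and the Hopper flash-attention build inside the image.
import torch

assert torch.cuda.is_available(), "CUDA device required"
major, minor = torch.cuda.get_device_capability()
print(f"compute capability: {major}.{minor}")  # the hopper/ build targets SM90 (H100)

try:
    import flash_attn_interface  # assumed module name installed by hopper/setup.py
    print("Hopper flash-attention import OK")
except ImportError as exc:
    print(f"Hopper flash-attention not importable: {exc}")
```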

lightllm/models/internvl/model.py

Lines changed: 26 additions & 1 deletion
@@ -4,6 +4,7 @@
 from lightllm.models.llama.model import LlamaTpPartModel
 from lightllm.models.phi3.model import Phi3TpPartModel
 from lightllm.models.qwen2.model import Qwen2TpPartModel
+from lightllm.models.deepseek2.model import Deepseek2TpPartModel
 from lightllm.models.qwen_vl.layer_infer.pre_layer_infer import LlamaMultimodalPreLayerInfer
 from lightllm.server.multimodal_params import MultimodalParams, ImageItem
 from lightllm.common.build_utils import repair_config
@@ -26,10 +27,10 @@
 IMG_END_TOKEN = "</img>"
 IMG_TOKEN = "<image>"
 
+
 # Warp of the origal tokenizer
 class InternvlTokenizer:
     def __init__(self, tokenizer, model_cfg, **kwargs):
-
         self.llm_model_type = model_cfg.get("llm_config").get("model_type")
         self.tokenizer = tokenizer
         self.image_length = int(os.environ.get("INTERNVL_IMAGE_LENGTH", 256))
@@ -200,3 +201,27 @@ def _init_config(self):
         if self.finetune_config:
             self.config["vocab_size"] = self.finetune_config.vocab_size
         return
+
+
+class InternVLDeepSeek2TpPartModel(Deepseek2TpPartModel):
+    # support Deepseek2,3,R1
+    # weight class
+    pre_and_post_weight_class = InternVLLlamaPreAndPostLayerWeight
+
+    # infer class
+    pre_layer_infer_class = LlamaMultimodalPreLayerInfer
+
+    def __init__(self, kvargs):
+        super().__init__(kvargs)
+        return
+
+    def _init_config(self):
+        with open(os.path.join(self.weight_dir_, "config.json"), "r") as json_file:
+            self.config = json.load(json_file)["llm_config"]
+        # rename keys
+        repair_config(self.config, same_names=["num_attention_heads", "n_head"])
+        repair_config(self.config, same_names=["hidden_size", "n_embd", "n_embed"])
+        repair_config(self.config, same_names=["num_hidden_layers", "n_layer"])
+        if self.finetune_config:
+            self.config["vocab_size"] = self.finetune_config.vocab_size
+        return
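The new InternVLDeepSeek2TpPartModel reads its LLM settings from the "llm_config" section of the checkpoint's config.json and normalizes key names with repair_config. A rough sketch of the expected layout, with illustrative values only:

```python
# Illustrative sketch of the config.json shape consumed by InternVLDeepSeek2TpPartModel._init_config().
example_config = {
    "llm_config": {
        "model_type": "deepseek_v3",   # base_backend.py routes this value to the new class
        "num_attention_heads": 128,    # alias n_head is unified by repair_config
        "hidden_size": 7168,           # aliases: n_embd / n_embed
        "num_hidden_layers": 61,       # alias: n_layer
    },
    # the vision tower and other InternVL sections are omitted here
}
```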

lightllm/models/llama/model.py

Lines changed: 6 additions & 2 deletions
@@ -68,7 +68,10 @@ def _init_custom(self):
         """
         Some model-specific initialization
         """
-        if self.config.get("use_rope_yarn", False):
+        if self.config.get("use_rope_yarn", False) or (
+            self.config.get("rope_scaling", None) is not None
+            and self.config.get("rope_scaling", {}).get("type", "base") == "yarn"
+        ):
             self._init_to_get_yarn_rotary()
         elif self.config.get("use_dynamic_ntk", False) or (
             self.config.get("rope_scaling", None) is not None
@@ -215,7 +218,8 @@ def _init_to_get_yarn_rotary(self):
             scale = 1.0
         else:
             scale = self.config.get("rope_scaling", {}).get("factor", 1.0)
-        original_max_position_embeddings = self.config.get("original_max_position_embeddings", 2048)
+        rope_config = self.config.get("rope_scaling", {})
+        original_max_position_embeddings = rope_config.get("original_max_position_embeddings", 2048)
         extrapolation_factor = 1.0
         attn_factor = 1.0
         beta_fast = 32.0
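With these two hunks, the YaRN rotary path is also taken when the checkpoint declares rope_scaling.type == "yarn", and original_max_position_embeddings is read from the rope_scaling block instead of the top-level config. A config fragment that exercises the new branch might look like this (values are illustrative):

```python
# Illustrative fragment: this rope_scaling block now triggers _init_to_get_yarn_rotary().
yarn_config_fragment = {
    "rope_scaling": {
        "type": "yarn",                             # matched by the new condition in _init_custom()
        "factor": 4.0,                              # read as `scale`
        "original_max_position_embeddings": 4096,   # now read from rope_scaling, not the top level
    }
}
```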

lightllm/models/vit/triton_kernel/flashattention_nopad.py

Lines changed: 1 addition & 2 deletions
@@ -175,7 +175,6 @@ def flash_attention_v3_fwd(
         v,
         None,
         None, # k_new, v_new
-        None, # qv
         o, # out
         None,
         None,
@@ -193,7 +192,7 @@
         None,
         None,
         softmax_scale,
-        causal=False,
+        False, # causal
         window_size=(-1, -1),
         softcap=0.0,
         num_splits=1,

lightllm/server/api_http.py

Lines changed: 0 additions & 51 deletions
@@ -385,57 +385,6 @@ async def tokens(request: Request):
         return create_error_response(HTTPStatus.EXPECTATION_FAILED, f"error: {str(e)}")
 
 
-# for special cases
-@app.get("/tokens_num")
-@app.post("/tokens_num")
-async def tokens_num(request: Request):
-    try:
-        request_dict = await request.json()
-        prompt = request_dict.pop("text")
-        sample_params_dict = request_dict.pop("parameters", {})
-
-        sampling_params = SamplingParams()
-        sampling_params.init(tokenizer=g_objs.httpserver_manager.tokenizer, **sample_params_dict)
-        sampling_params.verify()
-
-        multimodal_params_dict = request_dict.get("multimodal_params", {})
-        images_size = multimodal_params_dict.get("images", [])
-
-        prompt_ids = g_objs.httpserver_manager.tokenizer.encode(prompt, None, add_special_tokens=False)
-        image_tokens = 0
-        img_count = 0
-        max_num = 0
-        if sampling_params.image_max_patch_num >= 0:
-            max_num = sampling_params.image_max_patch_num
-        else:
-            num_images = len(images_size)
-            if num_images == 1:
-                max_num = 12
-            elif num_images > 1 and num_images <= 6:
-                max_num = 6
-            elif num_images > 6:
-                max_num = 0
-        image_token_length = int(os.environ.get("INTERNVL_IMAGE_LENGTH", 256))
-
-        for img_size in images_size:
-            img_count += 1
-            image_tokens += (
-                g_objs.httpserver_manager.tokenizer.get_image_patch_func(
-                    img_size[0], img_size[1], max_num=max_num, use_thumbnail=True
-                )
-                * image_token_length
-            )
-
-        num_tokens = len(prompt_ids) + image_tokens + img_count
-
-        return JSONResponse(
-            {"ntokens": num_tokens},
-            status_code=200,
-        )
-    except Exception as e:
-        return create_error_response(HTTPStatus.EXPECTATION_FAILED, f"error: {str(e)}")
-
-
 @app.get("/metrics")
 async def metrics() -> Response:
     data = await g_objs.metric_client.generate_latest()
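For reference, the removed /tokens_num handler accepted a JSON body with "text", optional "parameters", and a "multimodal_params" block whose "images" entries were (width, height) pairs. A sketch of a request it used to serve, reconstructed from the deleted handler (URL and values are placeholders):

```python
# Sketch of a request to the now-removed /tokens_num endpoint.
import requests

payload = {
    "text": "<image> Describe this picture.",
    "parameters": {"image_max_patch_num": 12},        # mapped onto SamplingParams
    "multimodal_params": {"images": [[1024, 768]]},   # per-image (width, height) pairs
}
resp = requests.post("http://localhost:8000/tokens_num", json=payload)
print(resp.json())  # previously returned {"ntokens": <prompt tokens + image tokens + image count>}
```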

lightllm/server/api_start.py

Lines changed: 9 additions & 0 deletions
@@ -11,6 +11,7 @@
 from .visualserver.manager import start_visual_process
 from lightllm.utils.log_utils import init_logger
 from lightllm.utils.envs_utils import set_env_start_args, set_unique_server_name, get_unique_server_name
+from lightllm.utils.envs_utils import get_lightllm_gunicorn_time_out_seconds
 from .detokenization.manager import start_detokenization_process
 from .router.manager import start_router_process
 from lightllm.utils.process_check import is_process_active
@@ -95,6 +96,10 @@ def normal_or_p_d_start(args):
     if args.use_dynamic_prompt_cache:
         assert args.token_healing_mode is False
 
+    # chunked prefill must be enabled together with dynamic_prompt_cache
+    if not args.disable_chunked_prefill:
+        assert args.use_dynamic_prompt_cache is True
+
     # Some modes cannot yet cooperate with the advanced dynamic scheduling algorithm, TODO.
     if args.diverse_mode:
         assert args.router_token_ratio == 0.0
@@ -246,6 +251,8 @@ def normal_or_p_d_start(args):
         "--error-logfile",
         "-",
         "lightllm.server.api_http:app",
+        "--timeout",
+        f"{get_lightllm_gunicorn_time_out_seconds()}",
     ]
 
     # start the child processes
@@ -303,6 +310,8 @@ def pd_master_start(args):
         "-",
         "--preload",
         "lightllm.server.api_http:app",
+        "--timeout",
+        f"{get_lightllm_gunicorn_time_out_seconds()}",
     ]
 
     http_server_process = subprocess.Popen(command)
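Both gunicorn launch commands now receive an explicit --timeout taken from get_lightllm_gunicorn_time_out_seconds(). The helper itself lives in lightllm/utils/envs_utils.py and is not shown in this diff; a hedged sketch of what such a helper typically looks like, where the environment variable name and default are assumptions:

```python
# Hypothetical sketch only; see lightllm/utils/envs_utils.py for the real implementation.
import os

def get_lightllm_gunicorn_time_out_seconds() -> int:
    # Assumed variable name and default, purely for illustration.
    return int(os.getenv("LIGHTLLM_GUNICORN_TIMEOUT", "180"))
```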

lightllm/server/api_tgi.py

Lines changed: 1 addition & 0 deletions
@@ -184,6 +184,7 @@ async def stream_results() -> AsyncGenerator[bytes, None]:
             ret["details"] = {
                 "generated_tokens": len(final_output),
                 "finish_reason": finish_status.get_finish_reason(),
+                "prompt_tokens": metadata.get("prompt_tokens", 0),
             }
 
             yield ("data:" + json.dumps(ret, ensure_ascii=False) + "\n\n").encode("utf-8")

lightllm/server/httpserver/manager.py

Lines changed: 7 additions & 3 deletions
@@ -288,7 +288,11 @@ async def generate(
         except Exception as e:
             logger.error(f"group_request_id: {group_request_id} has exception {str(e)}")
             # error need to release multimodel resources.
-            await self._release_multimodal_resources(multimodal_params)
+            # Multimodal resources not yet attached to a formal request object must be released
+            # here explicitly; request objects already registered in req_id_to_out_inf are
+            # reclaimed by the unified recycling loop.
+            if group_request_id not in self.req_id_to_out_inf:
+                await self._release_multimodal_resources(multimodal_params)
             await self.abort(group_request_id)
             raise e
         return
@@ -381,7 +385,7 @@ async def transfer_to_next_module_or_node(
             await self.transfer_to_next_module(group_req_objs)
             return
         # Slave nodes in multi-node pure-TP mode must forward requests in the order received; this needs a lock and queueing mechanism.
-        # self.request_order_queue implements a simple queued hand-off so that requests from the master and slave
+        # self.request_order_queue implements a simple queued hand-off so that requests from the master and slave
         # nodes reach each node's router in the same order, which is required for synchronized scheduling.
         if self.is_multinode_tp_slave:
             while True:
@@ -584,7 +588,7 @@ async def handle_loop(self):
         if self.pd_mode.is_P_or_D():
             self.forwarding_queue = AsyncQueue()
             asyncio.create_task(self.pd_handle_loop())
-
+
         # In multi-node TP mode the slave node needs to start a coroutine task to receive
         # the request objects forwarded by the master.

lightllm/server/multimodal_params.py

Lines changed: 4 additions & 0 deletions
@@ -33,6 +33,10 @@ def preload(self):
             img_data = ret.content
         elif self._type == "base64":
            img_data = base64.b64decode(self._data)
+        elif self._type == "image_size":
+            self.image_w = self._data[0]
+            self.image_h = self._data[1]
+            return
         else:
             raise ValueError(f"cannot read image which type is {self._type}!")
 
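The new "image_size" branch lets a client pass only an image's dimensions instead of pixel data; preload() then records image_w / image_h and returns without fetching anything. A sketch of such an entry (the exact request schema around it is assumed):

```python
# Illustrative sketch: an image entry that carries only dimensions, handled by the new branch above.
multimodal_params = {
    "images": [
        {"type": "image_size", "data": [1024, 768]},  # data[0] = width, data[1] = height
    ]
}
```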

lightllm/server/router/model_infer/mode_backend/base_backend.py

Lines changed: 8 additions & 1 deletion
@@ -27,7 +27,12 @@
 from lightllm.models.gemma_2b.model import Gemma_2bTpPartModel
 from lightllm.models.phi3.model import Phi3TpPartModel
 from lightllm.models.deepseek2.model import Deepseek2TpPartModel
-from lightllm.models.internvl.model import InternVLLlamaTpPartModel, InternVLPhi3TpPartModel, InternVLQwen2TpPartModel
+from lightllm.models.internvl.model import (
+    InternVLLlamaTpPartModel,
+    InternVLPhi3TpPartModel,
+    InternVLQwen2TpPartModel,
+    InternVLDeepSeek2TpPartModel,
+)
 from lightllm.models.internvl.model import InternVLInternlm2TpPartModel
 from lightllm.models.qwen2_vl.model import Qwen2VLTpPartModel
 from lightllm.models.qwen2_reward.model import Qwen2RewardTpPartModel
@@ -199,6 +204,8 @@ def init_model(self, kvargs):
                     self.model = InternVLLlamaTpPartModel(model_kvargs)
                 elif llm_model_type == "qwen2":
                     self.model = InternVLQwen2TpPartModel(model_kvargs)
+                elif llm_model_type == "deepseek_v3":
+                    self.model = InternVLDeepSeek2TpPartModel(model_kvargs)
                 self.is_multimodal = True
             else:
                 raise Exception(f"can not support {self.model_type} now")
