Merge branch 'main' into release/1.6

Jintao-Huang · Jintao-Huang · commit 337873844605 · 2024-02-07T15:39:49.000+08:00
diff --git a/README.md b/README.md
@@ -208,12 +208,24 @@ app_ui_main(infer_args)
     - [cogagent](https://github.com/THUDM/CogVLM) series: cogagent-18b-chat, cogagent-18b-instruct.
     - [internlm-xcomposer2](https://github.com/InternLM/InternLM-XComposer) series: internlm-xcomposer2-7b-chat.
   - General:
-    - [qwen](https://github.com/QwenLM/Qwen) series: qwen-1_8b, qwen-1_8b-chat, qwen-1_8b-chat-int4, qwen-1_8b-chat-int8, qwen-7b, qwen-7b-chat, qwen-7b-chat-int4, qwen-7b-chat-int8, qwen-14b, qwen-14b-chat, qwen-14b-chat-int4, qwen-14b-chat-int8, qwen-72b, qwen-72b-chat, qwen-72b-chat-int4, qwen-72b-chat-int8.
-    - [qwen1.5](https://github.com/QwenLM/Qwen1.5) series: qwen1half-0_5b, qwen1half-0_5b-chat, qwen1half-0_5b-chat-int4, qwen1half-0_5b-chat-int8, qwen1half-1_8b, qwen1half-1_8b-chat, qwen1half-1_8b-chat-int4, qwen1half-1_8b-chat-int8, qwen1half-4b, qwen1half-4b-chat, qwen1half-4b-chat-int4, qwen1half-4b-chat-int8, qwen1half-7b, qwen1half-7b-chat, qwen1half-7b-chat-int4, qwen1half-7b-chat-int8, qwen1half-14b, qwen1half-14b-chat, qwen1half-14b-chat-int4, qwen1half-14b-chat-int8, qwen1half-72b, qwen1half-72b-chat, qwen1half-72b-chat-int4, qwen1half-72b-chat-int8.
+    - [qwen](https://github.com/QwenLM/Qwen) series:
+      - qwen-1_8b, qwen-1_8b-chat, qwen-1_8b-chat-int4, qwen-1_8b-chat-int8.
+      - qwen-7b, qwen-7b-chat, qwen-7b-chat-int4, qwen-7b-chat-int8.
+      - qwen-14b, qwen-14b-chat, qwen-14b-chat-int4, qwen-14b-chat-int8.
+      - qwen-72b, qwen-72b-chat, qwen-72b-chat-int4, qwen-72b-chat-int8.
+    - [qwen1.5](https://github.com/QwenLM/Qwen1.5) series:
+      - qwen1half-0_5b, qwen1half-0_5b-chat, qwen1half-0_5b-chat-int4, qwen1half-0_5b-chat-int8.
+      - qwen1half-1_8b, qwen1half-1_8b-chat, qwen1half-1_8b-chat-int4, qwen1half-1_8b-chat-int8.
+      - qwen1half-4b, qwen1half-4b-chat, qwen1half-4b-chat-int4, qwen1half-4b-chat-int8.
+      - qwen1half-7b, qwen1half-7b-chat, qwen1half-7b-chat-int4, qwen1half-7b-chat-int8.
+      - qwen1half-14b, qwen1half-14b-chat, qwen1half-14b-chat-int4, qwen1half-14b-chat-int8.
+      - qwen1half-72b, qwen1half-72b-chat, qwen1half-72b-chat-int4, qwen1half-72b-chat-int8.
     - [chatglm](https://github.com/THUDM/ChatGLM-6B) series: chatglm2-6b, chatglm2-6b-32k, chatglm3-6b-base, chatglm3-6b, chatglm3-6b-32k.
     - [llama](https://github.com/facebookresearch/llama) series: llama2-7b, llama2-7b-chat, llama2-13b, llama2-13b-chat, llama2-70b, llama2-70b-chat.
     - [yi](https://github.com/01-ai/Yi) series: yi-6b, yi-6b-200k, yi-6b-chat, yi-34b, yi-34b-200k, yi-34b-chat.
-    - [internlm](https://github.com/InternLM/InternLM) series: internlm-7b, internlm-7b-chat, internlm-7b-chat-8k, internlm-20b, internlm-20b-chat, internlm2-7b-base, internlm2-7b, internlm2-7b-sft-chat, internlm2-7b-chat, internlm2-20b-base, internlm2-20b, internlm2-20b-sft-chat, internlm2-20b-chat.
+    - [internlm](https://github.com/InternLM/InternLM) series:
+      - internlm-7b, internlm-7b-chat, internlm-7b-chat-8k, internlm-20b, internlm-20b-chat.
+      - internlm2-7b-base, internlm2-7b, internlm2-7b-sft-chat, internlm2-7b-chat, internlm2-20b-base, internlm2-20b, internlm2-20b-sft-chat, internlm2-20b-chat.
     - [deepseek](https://github.com/deepseek-ai/deepseek-LLM) series: deepseek-7b, deepseek-7b-chat, deepseek-67b, deepseek-67b-chat, deepseek-moe-16b, deepseek-moe-16b-chat.
     - [openbuddy](https://github.com/OpenBuddy/OpenBuddy) series: openbuddy-llama2-13b-chat, openbuddy-llama-65b-chat, openbuddy-llama2-70b-chat, openbuddy-mistral-7b-chat, openbuddy-zephyr-7b-chat, openbuddy-deepseek-67b-chat.
     - [mistral](https://github.com/mistralai/mistral-src) series: mistral-7b, mistral-7b-instruct, mistral-7b-instruct-v2.
diff --git a/README_CN.md b/README_CN.md
@@ -207,12 +207,24 @@ app_ui_main(infer_args)
     - [cogagent](https://github.com/THUDM/CogVLM) 系列: cogagent-18b-chat, cogagent-18b-instruct.
     - [internlm-xcomposer2](https://github.com/InternLM/InternLM-XComposer) 系列: internlm-xcomposer2-7b-chat.
   - 通用:
-    - [qwen](https://github.com/QwenLM/Qwen) 系列: qwen-1_8b, qwen-1_8b-chat, qwen-1_8b-chat-int4, qwen-1_8b-chat-int8, qwen-7b, qwen-7b-chat, qwen-7b-chat-int4, qwen-7b-chat-int8, qwen-14b, qwen-14b-chat, qwen-14b-chat-int4, qwen-14b-chat-int8, qwen-72b, qwen-72b-chat, qwen-72b-chat-int4, qwen-72b-chat-int8.
-    - [qwen1.5](https://github.com/QwenLM/Qwen1.5) series: qwen1half-0_5b, qwen1half-0_5b-chat, qwen1half-0_5b-chat-int4, qwen1half-0_5b-chat-int8, qwen1half-1_8b, qwen1half-1_8b-chat, qwen1half-1_8b-chat-int4, qwen1half-1_8b-chat-int8, qwen1half-4b, qwen1half-4b-chat, qwen1half-4b-chat-int4, qwen1half-4b-chat-int8, qwen1half-7b, qwen1half-7b-chat, qwen1half-7b-chat-int4, qwen1half-7b-chat-int8, qwen1half-14b, qwen1half-14b-chat, qwen1half-14b-chat-int4, qwen1half-14b-chat-int8, qwen1half-72b, qwen1half-72b-chat, qwen1half-72b-chat-int4, qwen1half-72b-chat-int8.
+    - [qwen](https://github.com/QwenLM/Qwen) 系列:
+      - qwen-1_8b, qwen-1_8b-chat, qwen-1_8b-chat-int4, qwen-1_8b-chat-int8.
+      - qwen-7b, qwen-7b-chat, qwen-7b-chat-int4, qwen-7b-chat-int8.
+      - qwen-14b, qwen-14b-chat, qwen-14b-chat-int4, qwen-14b-chat-int8.
+      - qwen-72b, qwen-72b-chat, qwen-72b-chat-int4, qwen-72b-chat-int8.
+    - [qwen1.5](https://github.com/QwenLM/Qwen1.5) 系列:
+      - qwen1half-0_5b, qwen1half-0_5b-chat, qwen1half-0_5b-chat-int4, qwen1half-0_5b-chat-int8.
+      - qwen1half-1_8b, qwen1half-1_8b-chat, qwen1half-1_8b-chat-int4, qwen1half-1_8b-chat-int8.
+      - qwen1half-4b, qwen1half-4b-chat, qwen1half-4b-chat-int4, qwen1half-4b-chat-int8.
+      - qwen1half-7b, qwen1half-7b-chat, qwen1half-7b-chat-int4, qwen1half-7b-chat-int8.
+      - qwen1half-14b, qwen1half-14b-chat, qwen1half-14b-chat-int4, qwen1half-14b-chat-int8.
+      - qwen1half-72b, qwen1half-72b-chat, qwen1half-72b-chat-int4, qwen1half-72b-chat-int8.
     - [chatglm](https://github.com/THUDM/ChatGLM-6B) 系列: chatglm2-6b, chatglm2-6b-32k, chatglm3-6b-base, chatglm3-6b, chatglm3-6b-32k.
     - [llama](https://github.com/facebookresearch/llama) 系列: llama2-7b, llama2-7b-chat, llama2-13b, llama2-13b-chat, llama2-70b, llama2-70b-chat.
     - [yi](https://github.com/01-ai/Yi) 系列: yi-6b, yi-6b-200k, yi-6b-chat, yi-34b, yi-34b-200k, yi-34b-chat.
-    - [internlm](https://github.com/InternLM/InternLM) 系列: internlm-7b, internlm-7b-chat, internlm-7b-chat-8k, internlm-20b, internlm-20b-chat, internlm2-7b-base, internlm2-7b, internlm2-7b-sft-chat, internlm2-7b-chat, internlm2-20b-base, internlm2-20b, internlm2-20b-sft-chat, internlm2-20b-chat.
+    - [internlm](https://github.com/InternLM/InternLM) 系列:
+      - internlm-7b, internlm-7b-chat, internlm-7b-chat-8k, internlm-20b, internlm-20b-chat.
+      - internlm2-7b-base, internlm2-7b, internlm2-7b-sft-chat, internlm2-7b-chat, internlm2-20b-base, internlm2-20b, internlm2-20b-sft-chat, internlm2-20b-chat.
     - [deepseek](https://github.com/deepseek-ai/deepseek-LLM) 系列: deepseek-7b, deepseek-7b-chat, deepseek-67b, deepseek-67b-chat, deepseek-moe-16b, deepseek-moe-16b-chat.
     - [openbuddy](https://github.com/OpenBuddy/OpenBuddy) 系列: openbuddy-llama2-13b-chat, openbuddy-llama-65b-chat, openbuddy-llama2-70b-chat, openbuddy-mistral-7b-chat, openbuddy-zephyr-7b-chat, openbuddy-deepseek-67b-chat.
     - [mistral](https://github.com/mistralai/mistral-src) 系列: mistral-7b, mistral-7b-instruct, mistral-7b-instruct-v2.
diff --git a/docs/source/LLM/支持的模型和数据集.md b/docs/source/LLM/支持的模型和数据集.md
@@ -42,18 +42,18 @@
 |qwen1half-7b-chat|[qwen/Qwen1.5-7B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-7B-Chat/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|transformers>=4.37|
 |qwen1half-14b-chat|[qwen/Qwen1.5-14B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-14B-Chat/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|transformers>=4.37|
 |qwen1half-72b-chat|[qwen/Qwen1.5-72B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-72B-Chat/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|transformers>=4.37|
-|qwen1half-0_5b-chat-int8|[qwen/Qwen1.5-0.5B-Chat-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|auto_gptq>=0.5, transformers>=4.37|
 |qwen1half-0_5b-chat-int4|[qwen/Qwen1.5-0.5B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|auto_gptq>=0.5, transformers>=4.37|
-|qwen1half-1_8b-chat-int8|[qwen/Qwen1.5-1.8B-Chat-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen1.5-1.8B-Chat-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|auto_gptq>=0.5, transformers>=4.37|
 |qwen1half-1_8b-chat-int4|[qwen/Qwen1.5-1.8B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-1.8B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|auto_gptq>=0.5, transformers>=4.37|
-|qwen1half-4b-chat-int8|[qwen/Qwen1.5-4B-Chat-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen1.5-4B-Chat-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|auto_gptq>=0.5, transformers>=4.37|
 |qwen1half-4b-chat-int4|[qwen/Qwen1.5-4B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-4B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|auto_gptq>=0.5, transformers>=4.37|
-|qwen1half-7b-chat-int8|[qwen/Qwen1.5-7B-Chat-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen1.5-7B-Chat-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|auto_gptq>=0.5, transformers>=4.37|
 |qwen1half-7b-chat-int4|[qwen/Qwen1.5-7B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-7B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|auto_gptq>=0.5, transformers>=4.37|
-|qwen1half-14b-chat-int8|[qwen/Qwen1.5-14B-Chat-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen1.5-14B-Chat-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|auto_gptq>=0.5, transformers>=4.37|
 |qwen1half-14b-chat-int4|[qwen/Qwen1.5-14B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-14B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|auto_gptq>=0.5, transformers>=4.37|
-|qwen1half-72b-chat-int8|[qwen/Qwen1.5-72B-Chat-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen1.5-72B-Chat-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|auto_gptq>=0.5, transformers>=4.37|
 |qwen1half-72b-chat-int4|[qwen/Qwen1.5-72B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-72B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|auto_gptq>=0.5, transformers>=4.37|
+|qwen1half-0_5b-chat-int8|[qwen/Qwen1.5-0.5B-Chat-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|auto_gptq>=0.5, transformers>=4.37|
+|qwen1half-1_8b-chat-int8|[qwen/Qwen1.5-1.8B-Chat-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen1.5-1.8B-Chat-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|auto_gptq>=0.5, transformers>=4.37|
+|qwen1half-4b-chat-int8|[qwen/Qwen1.5-4B-Chat-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen1.5-4B-Chat-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|auto_gptq>=0.5, transformers>=4.37|
+|qwen1half-7b-chat-int8|[qwen/Qwen1.5-7B-Chat-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen1.5-7B-Chat-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|auto_gptq>=0.5, transformers>=4.37|
+|qwen1half-14b-chat-int8|[qwen/Qwen1.5-14B-Chat-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen1.5-14B-Chat-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|auto_gptq>=0.5, transformers>=4.37|
+|qwen1half-72b-chat-int8|[qwen/Qwen1.5-72B-Chat-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen1.5-72B-Chat-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|&#x2714;|&#x2714;|auto_gptq>=0.5, transformers>=4.37|
 |qwen-vl|[qwen/Qwen-VL](https://modelscope.cn/models/qwen/Qwen-VL/summary)|c_attn|default-generation|&#x2714;|&#x2718;||
 |qwen-vl-chat|[qwen/Qwen-VL-Chat](https://modelscope.cn/models/qwen/Qwen-VL-Chat/summary)|c_attn|qwen|&#x2714;|&#x2718;||
 |qwen-vl-chat-int4|[qwen/Qwen-VL-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-VL-Chat-Int4/summary)|c_attn|qwen|&#x2714;|&#x2718;|auto_gptq>=0.5|
@@ -119,10 +119,10 @@
 |baichuan-13b-chat|[baichuan-inc/Baichuan-13B-Chat](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Chat/summary)|W_pack|baichuan|&#x2718;|&#x2714;|transformers<4.34|
 |baichuan2-7b|[baichuan-inc/Baichuan2-7B-Base](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Base/summary)|W_pack|default-generation|&#x2718;|&#x2714;||
 |baichuan2-7b-chat|[baichuan-inc/Baichuan2-7B-Chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Chat/summary)|W_pack|baichuan|&#x2718;|&#x2714;||
-|baichuan2-7b-chat-int4|[baichuan-inc/Baichuan2-7B-Chat-4bits](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Chat-4bits/summary)|W_pack|baichuan|&#x2718;|&#x2718;||
+|baichuan2-7b-chat-int4|[baichuan-inc/Baichuan2-7B-Chat-4bits](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Chat-4bits/summary)|W_pack|baichuan|&#x2718;|&#x2718;|bitsandbytes<0.41.2, accelerate<0.26|
 |baichuan2-13b|[baichuan-inc/Baichuan2-13B-Base](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Base/summary)|W_pack|default-generation|&#x2718;|&#x2714;||
 |baichuan2-13b-chat|[baichuan-inc/Baichuan2-13B-Chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Chat/summary)|W_pack|baichuan|&#x2718;|&#x2714;||
-|baichuan2-13b-chat-int4|[baichuan-inc/Baichuan2-13B-Chat-4bits](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Chat-4bits/summary)|W_pack|baichuan|&#x2718;|&#x2718;||
+|baichuan2-13b-chat-int4|[baichuan-inc/Baichuan2-13B-Chat-4bits](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Chat-4bits/summary)|W_pack|baichuan|&#x2718;|&#x2718;|bitsandbytes<0.41.2, accelerate<0.26|
 |yuan2-2b-instruct|[YuanLLM/Yuan2.0-2B-hf](https://modelscope.cn/models/YuanLLM/Yuan2.0-2B-hf/summary)|q_proj, k_proj, v_proj|yuan|&#x2714;|&#x2718;||
 |yuan2-2b-janus-instruct|[YuanLLM/Yuan2-2B-Janus-hf](https://modelscope.cn/models/YuanLLM/Yuan2-2B-Janus-hf/summary)|q_proj, k_proj, v_proj|yuan|&#x2714;|&#x2718;||
 |yuan2-51b-instruct|[YuanLLM/Yuan2.0-51B-hf](https://modelscope.cn/models/YuanLLM/Yuan2.0-51B-hf/summary)|q_proj, k_proj, v_proj|yuan|&#x2714;|&#x2718;||
diff --git a/scripts/utils/run_model_info.py b/scripts/utils/run_model_info.py
@@ -45,55 +45,5 @@ def get_model_info_table() -> List[str]:
     return res
 
 
-def get_model_info_readme_zh(data: List[str]) -> None:
-    fpath = 'README_CN.md'
-    with open(fpath, 'r') as f:
-        text = f.read()
-    start_idx = text.find('  - 多模态:')
-    end_idx = text.find('- 支持的数据集:')
-    text = text[start_idx:end_idx]
-    match_list = re.findall(r'- (.+)( 系列)?: (.+)', text)
-    model_list = []
-    for match in match_list:
-        model_list += match[2].strip('.').split(',')
-    model_list_2 = []
-    for model in model_list:
-        model = model.strip()
-        model_match = re.search(r'\[(.+)\]\(.+\)', model)
-        if model_match is not None:
-            model = model_match.group(1)
-        model_list_2.append(model)
-    model_list = model_list_2
-    model_type_list = [d[0] for d in data]
-    print(set(model_type_list) - set(model_list))
-    print(set(model_list) - set(model_type_list))
-
-
-def get_model_info_readme_en(data: List[str]) -> None:
-    fpath = 'README.md'
-    with open(fpath, 'r') as f:
-        raw_text = f.read()
-    start_idx = raw_text.find('  - Multi-Modal:')
-    end_idx = raw_text.find('- Supported Datasets:')
-    text = raw_text[start_idx:end_idx]
-    match_list = re.findall(r'- (.+)( series)?: (.+)', text)
-    model_list = []
-    for match in match_list:
-        model_list += match[2].strip('.').split(',')
-    model_list_2 = []
-    for model in model_list:
-        model = model.strip()
-        model_match = re.search(r'\[(.+)\]\(.+\)', model)
-        if model_match is not None:
-            model = model_match.group(1)
-        model_list_2.append(model)
-    model_list = model_list_2
-    model_type_list = [d[0] for d in data]
-    print(set(model_type_list) - set(model_list))
-    print(set(model_list) - set(model_type_list))
-
-
 if __name__ == '__main__':
     result = get_model_info_table()
-    result_en = get_model_info_readme_en(result)
-    result_zh = get_model_info_readme_zh(result)
diff --git a/swift/llm/deploy.py b/swift/llm/deploy.py
@@ -121,6 +121,9 @@ async def inference_vllm_async(request: Union[ChatCompletionRequest,
     tokenizer = template.tokenizer
     if tokenizer.eos_token is not None and tokenizer.eos_token not in generation_config.stop:
         generation_config.stop.append(tokenizer.eos_token)
+    if isinstance(template.suffix[-1],
+                  str) and template.suffix[-1] not in generation_config.stop:
+        generation_config.stop.append(template.suffix[-1])
     created_time = int(time.time())
     result_generator = llm_engine.generate(None, generation_config, request_id,
                                            input_ids)
diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py
diff --git a/swift/llm/utils/vllm_utils.py b/swift/llm/utils/vllm_utils.py