Commit 27e1e76

add vllm lmdeploy benchmark (#1889)
1 parent d1efdb7

7 files changed: 82 additions & 6 deletions

docs/source/LLM/LmDeploy推理加速与部署.md

Lines changed: 2 additions & 0 deletions
@@ -181,5 +181,7 @@ CUDA_VISIBLE_DEVICES=0,1 swift deploy --model_type qwen2-72b-instruct --infer_ba
 
 The client invocation method can be found in: [vLLM Inference Acceleration and Deployment documentation](VLLM推理加速与部署.md#部署)
 
+Benchmark testing code: https://github.com/modelscope/ms-swift/blob/main/scripts/benchmark/deploy.py
+
 ## Multimodal
 See [here](../Multi-Modal/LmDeploy推理加速文档.md)

docs/source/LLM/VLLM推理加速与部署.md

Lines changed: 2 additions & 0 deletions
@@ -234,6 +234,8 @@ swift uses VLLM as the inference backend and is compatible with the OpenAI API style.
 
 The client-side OpenAI API parameters can be found at: https://platform.openai.com/docs/api-reference/introduction.
 
+Benchmark testing code: https://github.com/modelscope/ms-swift/blob/main/scripts/benchmark/deploy.py
+
 ### Original Models
 #### qwen-7b-chat

docs/source_en/LLM/LmDeploy-inference-acceleration-and-deployment.md

Lines changed: 2 additions & 0 deletions
@@ -110,5 +110,7 @@ CUDA_VISIBLE_DEVICES=0,1 swift deploy --model_type qwen2-72b-instruct --infer_ba
 
 The method for client invocation can be found in: [vLLM Inference Acceleration and Deployment Documentation](VLLM-inference-acceleration-and-deployment.md#deployment).
 
+Benchmark testing code: https://github.com/modelscope/ms-swift/blob/main/scripts/benchmark/deploy.py
+
 ## Multimodal
 Check [here](../Multi-Modal/LmDeploy-inference-acceleration-and-deployment.md)

docs/source_en/LLM/VLLM-inference-acceleration-and-deployment.md

Lines changed: 2 additions & 0 deletions
@@ -174,6 +174,8 @@ For server deployment command line arguments, refer to: [deploy command line arg
 
 For OpenAI API arguments on the client side, refer to: https://platform.openai.com/docs/api-reference/introduction.
 
+Benchmark testing code: https://github.com/modelscope/ms-swift/blob/main/scripts/benchmark/deploy.py
+
 ### Original Models
 #### qwen-7b-chat

scripts/benchmark/deploy.py

Lines changed: 62 additions & 0 deletions
@@ -0,0 +1,62 @@

def test_benchmark(infer_backend):
    import os
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'
    # TIMEOUT=-1 disables the client-side request timeout (see client_utils.py below),
    # so long generations are not cut off.
    os.environ['TIMEOUT'] = '-1'
    import requests
    from swift.llm import DeployArguments, get_dataset, get_model_list_client, XRequestConfig, inference_client_async
    from swift.llm.deploy import llm_deploy
    import multiprocessing
    import time
    import asyncio
    from swift.utils import get_logger

    logger = get_logger()

    # Launch the deploy server in a background 'spawn' process with the requested backend.
    mp = multiprocessing.get_context('spawn')
    process = mp.Process(
        target=llm_deploy,
        args=(DeployArguments(model_type='qwen2-7b-instruct', infer_backend=infer_backend, verbose=False), ))
    process.start()

    dataset = get_dataset(['alpaca-zh#1000', 'alpaca-en#1000'])[0]
    query_list = dataset['query']
    request_config = XRequestConfig(seed=42, max_tokens=8192)

    # Poll until the deploy server is reachable.
    while True:
        try:
            model_list = get_model_list_client()
        except requests.exceptions.ConnectionError:
            time.sleep(5)
            continue
        break
    model_type = model_list.data[0].id
    is_chat = model_list.data[0].is_chat
    is_multimodal = model_list.data[0].is_multimodal
    print(f'model_type: {model_type}')

    # Issue all 2000 requests concurrently through the async client.
    tasks = []
    for query in query_list:
        tasks.append(
            inference_client_async(
                model_type, query, request_config=request_config, is_chat=is_chat, is_multimodal=is_multimodal))

    async def _batch_run(tasks):
        return await asyncio.gather(*tasks)

    resp_list = asyncio.run(_batch_run(tasks))
    logger.info(f'len(resp_list): {len(resp_list)}')
    logger.info(f'resp_list[0]: {resp_list[0]}')
    process.terminate()


def test_vllm_benchmark():
    test_benchmark('vllm')


def test_lmdeploy_benchmark():
    test_benchmark('lmdeploy')


if __name__ == '__main__':
    # test_vllm_benchmark()
    test_lmdeploy_benchmark()
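Because the script keeps an `if __name__ == '__main__':` entry point, it can be launched directly with `python scripts/benchmark/deploy.py`; a test runner such as pytest can also select one backend at a time via the `test_vllm_benchmark` and `test_lmdeploy_benchmark` wrappers.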

swift/llm/utils/client_utils.py

Lines changed: 8 additions & 5 deletions
@@ -19,11 +19,12 @@
 
 
 def _get_request_kwargs(api_key: Optional[str] = None) -> Dict[str, Any]:
-    timeout = float(os.getenv('TIMEOUT', '60'))
-    request_kwargs = {'timeout': timeout}
-    if api_key is None:
-        return request_kwargs
-    request_kwargs['headers'] = {'Authorization': f'Bearer {api_key}'}
+    timeout = float(os.getenv('TIMEOUT', '300'))
+    request_kwargs = {}
+    if timeout > 0:
+        request_kwargs['timeout'] = timeout
+    if api_key is not None:
+        request_kwargs['headers'] = {'Authorization': f'Bearer {api_key}'}
     return request_kwargs
 
 
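With this change the default client timeout rises from 60 s to 300 s, and a non-positive TIMEOUT (for example the -1 exported by scripts/benchmark/deploy.py) now omits the timeout entirely instead of handing a negative value to requests. A minimal sketch of the effect; the helper body is reproduced here purely for illustration:

import os
from typing import Any, Dict, Optional


def get_request_kwargs_sketch(api_key: Optional[str] = None) -> Dict[str, Any]:
    # Mirrors the patched helper above, for illustration only.
    timeout = float(os.getenv('TIMEOUT', '300'))
    request_kwargs: Dict[str, Any] = {}
    if timeout > 0:
        request_kwargs['timeout'] = timeout
    if api_key is not None:
        request_kwargs['headers'] = {'Authorization': f'Bearer {api_key}'}
    return request_kwargs


os.environ['TIMEOUT'] = '-1'            # benchmark setting: no client-side timeout
print(get_request_kwargs_sketch())      # {}

os.environ['TIMEOUT'] = '120'
print(get_request_kwargs_sketch('sk-xxx'))
# {'timeout': 120.0, 'headers': {'Authorization': 'Bearer sk-xxx'}}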

@@ -280,6 +281,7 @@ def inference_client(
     if request_config is None:
         request_config = XRequestConfig()
     model_list = None
+    is_chat_request = is_chat_request or kwargs.get('is_chat')
     if is_chat_request is None or is_multimodal is None:
         model_list = get_model_list_client(host, port, api_key=api_key, **kwargs)
 
@@ -350,6 +352,7 @@ async def inference_client_async(
     if request_config is None:
         request_config = XRequestConfig()
     model_list = None
+    is_chat_request = is_chat_request or kwargs.get('is_chat')
     if is_chat_request is None or is_multimodal is None:
         model_list = await get_model_list_client_async(host, port, api_key=api_key, **kwargs)
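These two hunks let `is_chat` act as an alias for `is_chat_request` in both the sync and async clients, which is how scripts/benchmark/deploy.py calls the async variant. A minimal synchronous sketch under the same assumptions: a swift deploy server already running on the default host and port, `inference_client` exported from swift.llm alongside `inference_client_async`, and a placeholder model name and query:

from swift.llm import XRequestConfig, inference_client

# Supplying is_chat together with is_multimodal lets the client skip the extra
# model-list request it would otherwise make to infer these flags.
request_config = XRequestConfig(seed=42, max_tokens=512)
resp = inference_client(
    'qwen2-7b-instruct', 'Tell me a joke.',
    request_config=request_config, is_chat=True, is_multimodal=False)
print(resp)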

swift/llm/utils/vision_utils.py

Lines changed: 4 additions & 1 deletion
@@ -100,8 +100,11 @@ def load_file(path: Union[str, _T]) -> Union[BytesIO, _T]:
     if isinstance(path, str):
         path = path.strip()
         if path.startswith('http'):
+            request_kwargs = {}
             timeout = float(os.getenv('TIMEOUT', '60'))
-            content = requests.get(path, timeout=timeout).content
+            if timeout > 0:
+                request_kwargs['timeout'] = timeout
+            content = requests.get(path, **request_kwargs).content
             res = BytesIO(content)
         elif os.path.exists(path):
             with open(path, 'rb') as f:
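The same convention applies to remote file downloads: a non-positive TIMEOUT now disables the download timeout as well. A small sketch, with the import path taken from the file shown above and a placeholder URL:

import os
from swift.llm.utils.vision_utils import load_file

os.environ['TIMEOUT'] = '-1'   # let slow image/video downloads finish without a timeout
buf = load_file('https://example.com/cat.png')   # placeholder URL; returns a BytesIO for http paths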
