Commit 27e1e76

add vllm lmdeploy benchmark (#1889)
1 parent d1efdb7

7 files changed: 82 additions & 6 deletions

docs/source/LLM/LmDeploy推理加速与部署.md

Lines changed: 2 additions & 0 deletions
@@ -181,5 +181,7 @@ CUDA_VISIBLE_DEVICES=0,1 swift deploy --model_type qwen2-72b-instruct --infer_ba
 
 The client invocation method can be found in: [vLLM Inference Acceleration and Deployment documentation](VLLM推理加速与部署.md#部署)
 
+Benchmark testing code: https://github.com/modelscope/ms-swift/blob/main/scripts/benchmark/deploy.py
+
 ## Multimodal
 See [here](../Multi-Modal/LmDeploy推理加速文档.md)

docs/source/LLM/VLLM推理加速与部署.md

Lines changed: 2 additions & 0 deletions
@@ -234,6 +234,8 @@ swift uses VLLM as the inference backend and is compatible with the OpenAI API style.
 
 The client-side OpenAI API parameters can be found at: https://platform.openai.com/docs/api-reference/introduction.
 
+Benchmark testing code: https://github.com/modelscope/ms-swift/blob/main/scripts/benchmark/deploy.py
+
 ### Original Models
 #### qwen-7b-chat

docs/source_en/LLM/LmDeploy-inference-acceleration-and-deployment.md

Lines changed: 2 additions & 0 deletions
@@ -110,5 +110,7 @@ CUDA_VISIBLE_DEVICES=0,1 swift deploy --model_type qwen2-72b-instruct --infer_ba
 
 The method for client invocation can be found in: [vLLM Inference Acceleration and Deployment Documentation](VLLM-inference-acceleration-and-deployment.md#deployment).
 
+Benchmark testing code: https://github.com/modelscope/ms-swift/blob/main/scripts/benchmark/deploy.py
+
 ## Multimodal
 Check [here](../Multi-Modal/LmDeploy-inference-acceleration-and-deployment.md)

docs/source_en/LLM/VLLM-inference-acceleration-and-deployment.md

Lines changed: 2 additions & 0 deletions
@@ -174,6 +174,8 @@ For server deployment command line arguments, refer to: [deploy command line arg
 
 For OpenAI API arguments on the client side, refer to: https://platform.openai.com/docs/api-reference/introduction.
 
+Benchmark testing code: https://github.com/modelscope/ms-swift/blob/main/scripts/benchmark/deploy.py
+
 ### Original Models
 #### qwen-7b-chat

scripts/benchmark/deploy.py

Lines changed: 62 additions & 0 deletions
@@ -0,0 +1,62 @@

def test_benchmark(infer_backend):
    import os
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'
    # TIMEOUT=-1 disables the client-side request timeout (see client_utils.py below),
    # so long generations are not cut off.
    os.environ['TIMEOUT'] = '-1'
    import requests
    from swift.llm import DeployArguments, get_dataset, get_model_list_client, XRequestConfig, inference_client_async
    from swift.llm.deploy import llm_deploy
    import multiprocessing
    import time
    import asyncio
    from swift.utils import get_logger

    logger = get_logger()

    # Launch the deploy server in a background 'spawn' process with the requested backend.
    mp = multiprocessing.get_context('spawn')
    process = mp.Process(
        target=llm_deploy,
        args=(DeployArguments(model_type='qwen2-7b-instruct', infer_backend=infer_backend, verbose=False), ))
    process.start()

    dataset = get_dataset(['alpaca-zh#1000', 'alpaca-en#1000'])[0]
    query_list = dataset['query']
    request_config = XRequestConfig(seed=42, max_tokens=8192)

    # Poll until the deploy server is reachable.
    while True:
        try:
            model_list = get_model_list_client()
        except requests.exceptions.ConnectionError:
            time.sleep(5)
            continue
        break
    model_type = model_list.data[0].id
    is_chat = model_list.data[0].is_chat
    is_multimodal = model_list.data[0].is_multimodal
    print(f'model_type: {model_type}')

    # Issue all 2000 requests concurrently through the async client.
    tasks = []
    for query in query_list:
        tasks.append(
            inference_client_async(
                model_type, query, request_config=request_config, is_chat=is_chat, is_multimodal=is_multimodal))

    async def _batch_run(tasks):
        return await asyncio.gather(*tasks)

    resp_list = asyncio.run(_batch_run(tasks))
    logger.info(f'len(resp_list): {len(resp_list)}')
    logger.info(f'resp_list[0]: {resp_list[0]}')
    process.terminate()


def test_vllm_benchmark():
    test_benchmark('vllm')


def test_lmdeploy_benchmark():
    test_benchmark('lmdeploy')


if __name__ == '__main__':
    # test_vllm_benchmark()
    test_lmdeploy_benchmark()
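Because the script keeps an `if __name__ == '__main__':` entry point, it can be launched directly with `python scripts/benchmark/deploy.py`; a test runner such as pytest can also select one backend at a time via the `test_vllm_benchmark` and `test_lmdeploy_benchmark` wrappers.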

swift/llm/utils/client_utils.py

Lines changed: 8 additions & 5 deletions
@@ -19,11 +19,12 @@
 
 
 def _get_request_kwargs(api_key: Optional[str] = None) -> Dict[str, Any]:
-    timeout = float(os.getenv('TIMEOUT', '60'))
-    request_kwargs = {'timeout': timeout}
-    if api_key is None:
-        return request_kwargs
-    request_kwargs['headers'] = {'Authorization': f'Bearer {api_key}'}
+    timeout = float(os.getenv('TIMEOUT', '300'))
+    request_kwargs = {}
+    if timeout > 0:
+        request_kwargs['timeout'] = timeout
+    if api_key is not None:
+        request_kwargs['headers'] = {'Authorization': f'Bearer {api_key}'}
     return request_kwargs
 
 
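With this change the default client timeout rises from 60 s to 300 s, and a non-positive TIMEOUT (for example the -1 exported by scripts/benchmark/deploy.py) now omits the timeout entirely instead of handing a negative value to requests. A minimal sketch of the effect; the helper body is reproduced here purely for illustration:

import os
from typing import Any, Dict, Optional


def get_request_kwargs_sketch(api_key: Optional[str] = None) -> Dict[str, Any]:
    # Mirrors the patched helper above, for illustration only.
    timeout = float(os.getenv('TIMEOUT', '300'))
    request_kwargs: Dict[str, Any] = {}
    if timeout > 0:
        request_kwargs['timeout'] = timeout
    if api_key is not None:
        request_kwargs['headers'] = {'Authorization': f'Bearer {api_key}'}
    return request_kwargs


os.environ['TIMEOUT'] = '-1'            # benchmark setting: no client-side timeout
print(get_request_kwargs_sketch())      # {}

os.environ['TIMEOUT'] = '120'
print(get_request_kwargs_sketch('sk-xxx'))
# {'timeout': 120.0, 'headers': {'Authorization': 'Bearer sk-xxx'}}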

@@ -280,6 +281,7 @@ def inference_client(
     if request_config is None:
         request_config = XRequestConfig()
     model_list = None
+    is_chat_request = is_chat_request or kwargs.get('is_chat')
     if is_chat_request is None or is_multimodal is None:
         model_list = get_model_list_client(host, port, api_key=api_key, **kwargs)
 
@@ -350,6 +352,7 @@ async def inference_client_async(
     if request_config is None:
         request_config = XRequestConfig()
     model_list = None
+    is_chat_request = is_chat_request or kwargs.get('is_chat')
     if is_chat_request is None or is_multimodal is None:
         model_list = await get_model_list_client_async(host, port, api_key=api_key, **kwargs)
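These two hunks let `is_chat` act as an alias for `is_chat_request` in both the sync and async clients, which is how scripts/benchmark/deploy.py calls the async variant. A minimal synchronous sketch under the same assumptions: a swift deploy server already running on the default host and port, `inference_client` exported from swift.llm alongside `inference_client_async`, and a placeholder model name and query:

from swift.llm import XRequestConfig, inference_client

# Supplying is_chat together with is_multimodal lets the client skip the extra
# model-list request it would otherwise make to infer these flags.
request_config = XRequestConfig(seed=42, max_tokens=512)
resp = inference_client(
    'qwen2-7b-instruct', 'Tell me a joke.',
    request_config=request_config, is_chat=True, is_multimodal=False)
print(resp)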

swift/llm/utils/vision_utils.py

Lines changed: 4 additions & 1 deletion
@@ -100,8 +100,11 @@ def load_file(path: Union[str, _T]) -> Union[BytesIO, _T]:
     if isinstance(path, str):
         path = path.strip()
         if path.startswith('http'):
+            request_kwargs = {}
             timeout = float(os.getenv('TIMEOUT', '60'))
-            content = requests.get(path, timeout=timeout).content
+            if timeout > 0:
+                request_kwargs['timeout'] = timeout
+            content = requests.get(path, **request_kwargs).content
             res = BytesIO(content)
         elif os.path.exists(path):
             with open(path, 'rb') as f:
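The same convention applies to remote file downloads: a non-positive TIMEOUT now disables the download timeout as well. A small sketch, with the import path taken from the file shown above and a placeholder URL:

import os
from swift.llm.utils.vision_utils import load_file

os.environ['TIMEOUT'] = '-1'   # let slow image/video downloads finish without a timeout
buf = load_file('https://example.com/cat.png')   # placeholder URL; returns a BytesIO for http paths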
