From 1bcdbbbd6146eacaee78fa64bc209db3a06eb343 Mon Sep 17 00:00:00 2001 From: ImmarKarim <87947114+ImmarKarim@users.noreply.github.com> Date: Wed, 1 May 2024 22:29:33 +0500 Subject: [PATCH 1/5] Add truss example for Qwen1.5-110B with vllm & streaming support --- qwen/qwen-110B-chat/config.yaml | 19 ++++++++ qwen/qwen-110B-chat/model/__init__.py | 0 qwen/qwen-110B-chat/model/model.py | 64 +++++++++++++++++++++++++++ 3 files changed, 83 insertions(+) create mode 100644 qwen/qwen-110B-chat/config.yaml create mode 100644 qwen/qwen-110B-chat/model/__init__.py create mode 100644 qwen/qwen-110B-chat/model/model.py diff --git a/qwen/qwen-110B-chat/config.yaml b/qwen/qwen-110B-chat/config.yaml new file mode 100644 index 000000000..81d87f478 --- /dev/null +++ b/qwen/qwen-110B-chat/config.yaml @@ -0,0 +1,19 @@ +environment_variables: {CUDA_VISIBLE_DEVICES: "0,1,2,3"} +external_package_dirs: [] +model_metadata: + example_model_input: {"prompt": "How long would it take to reach the sun?"} +model_name: Qwen1.5-vllm-streaming +python_version: py310 +requirements: +- torch==2.1.2 +- transformers==4.37.0 +- vllm +- asyncio==3.4.3 +- ray +resources: + accelerator: A100 + cpu: '40' + memory: 100Gi + use_gpu: true +secrets: {} +system_packages: [] diff --git a/qwen/qwen-110B-chat/model/__init__.py b/qwen/qwen-110B-chat/model/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/qwen/qwen-110B-chat/model/model.py b/qwen/qwen-110B-chat/model/model.py new file mode 100644 index 000000000..f4efa7343 --- /dev/null +++ b/qwen/qwen-110B-chat/model/model.py @@ -0,0 +1,64 @@ +import subprocess +import uuid +from transformers import AutoTokenizer + +from vllm import SamplingParams +from vllm.engine.arg_utils import AsyncEngineArgs +from vllm.engine.async_llm_engine import AsyncLLMEngine + + +class Model: + def __init__(self, model_name="Qwen/Qwen1.5-110B-Chat"): + self.model_name = model_name + self.tokenizer = None + self.sampling_params = None + + command = "ray start --head" + subprocess.check_output(command, shell=True, text=True) + + def load(self): + self.model_args = AsyncEngineArgs( + model=self.model_name, + dtype='auto', + enforce_eager=True, + tensor_parallel_size=4 + + ) + + self.tokenizer = AutoTokenizer.from_pretrained(self.model_name) + + self.sampling_params = SamplingParams( # Using default values + temperature=0.7, + top_p=0.8, + repetition_penalty=1.05, + max_tokens=512 + ) + + self.llm_engine = AsyncLLMEngine.from_engine_args(self.model_args) + + async def predict(self, model_input): + message = model_input.pop("prompt") + + prompt = [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": message} + ] + + text = self.tokenizer.apply_chat_template( + prompt, + tokenize=False, + add_generation_prompt=True + ) + + idx = str(uuid.uuid4().hex) + vllm_generator = self.llm_engine.generate(text, self.sampling_params, idx) + + async def generator(): + full_text = "" + async for output in vllm_generator: + text = output.outputs[0].text + delta = text[len(full_text) :] + full_text = text + yield delta + + return generator() \ No newline at end of file From ecc007098911bc86d935a25ebb8ce7d4c237b889 Mon Sep 17 00:00:00 2001 From: ImmarKarim <87947114+ImmarKarim@users.noreply.github.com> Date: Fri, 3 May 2024 13:15:55 +0500 Subject: [PATCH 2/5] Changed the resources section - PR feedback --- qwen/qwen-110B-chat/config.yaml | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/qwen/qwen-110B-chat/config.yaml b/qwen/qwen-110B-chat/config.yaml index 81d87f478..e1118cd18 100644 --- a/qwen/qwen-110B-chat/config.yaml +++ b/qwen/qwen-110B-chat/config.yaml @@ -5,15 +5,13 @@ model_metadata: model_name: Qwen1.5-vllm-streaming python_version: py310 requirements: -- torch==2.1.2 -- transformers==4.37.0 -- vllm +- torch==2.2.1 +- transformers==4.40.0 +- vllm==0.4.1 - asyncio==3.4.3 - ray resources: - accelerator: A100 - cpu: '40' - memory: 100Gi + accelerator: A100:4 use_gpu: true secrets: {} system_packages: [] From 5c90be001818070de2ea7e395e8491fa55c43545 Mon Sep 17 00:00:00 2001 From: ImmarKarim <87947114+ImmarKarim@users.noreply.github.com> Date: Sat, 11 May 2024 00:29:26 +0500 Subject: [PATCH 3/5] Removed env from config --- qwen/qwen-110B-chat/config.yaml | 3 +-- qwen/qwen-110B-chat/model/model.py | 3 ++- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/qwen/qwen-110B-chat/config.yaml b/qwen/qwen-110B-chat/config.yaml index e1118cd18..9f75ae845 100644 --- a/qwen/qwen-110B-chat/config.yaml +++ b/qwen/qwen-110B-chat/config.yaml @@ -1,8 +1,7 @@ -environment_variables: {CUDA_VISIBLE_DEVICES: "0,1,2,3"} external_package_dirs: [] model_metadata: example_model_input: {"prompt": "How long would it take to reach the sun?"} -model_name: Qwen1.5-vllm-streaming +model_name: Qwen1.5-vllm-streaminggg python_version: py310 requirements: - torch==2.2.1 diff --git a/qwen/qwen-110B-chat/model/model.py b/qwen/qwen-110B-chat/model/model.py index f4efa7343..32fd14383 100644 --- a/qwen/qwen-110B-chat/model/model.py +++ b/qwen/qwen-110B-chat/model/model.py @@ -61,4 +61,5 @@ async def generator(): full_text = text yield delta - return generator() \ No newline at end of file + return generator() + From ede8d24e3ae90492cce6d87035e22effdb6a4bb5 Mon Sep 17 00:00:00 2001 From: ImmarKarim <87947114+ImmarKarim@users.noreply.github.com> Date: Sat, 11 May 2024 00:34:00 +0500 Subject: [PATCH 4/5] Revert "Removed env from config" This reverts commit 5c90be001818070de2ea7e395e8491fa55c43545. --- qwen/qwen-110B-chat/config.yaml | 3 ++- qwen/qwen-110B-chat/model/model.py | 3 +-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/qwen/qwen-110B-chat/config.yaml b/qwen/qwen-110B-chat/config.yaml index 9f75ae845..e1118cd18 100644 --- a/qwen/qwen-110B-chat/config.yaml +++ b/qwen/qwen-110B-chat/config.yaml @@ -1,7 +1,8 @@ +environment_variables: {CUDA_VISIBLE_DEVICES: "0,1,2,3"} external_package_dirs: [] model_metadata: example_model_input: {"prompt": "How long would it take to reach the sun?"} -model_name: Qwen1.5-vllm-streaminggg +model_name: Qwen1.5-vllm-streaming python_version: py310 requirements: - torch==2.2.1 diff --git a/qwen/qwen-110B-chat/model/model.py b/qwen/qwen-110B-chat/model/model.py index 32fd14383..f4efa7343 100644 --- a/qwen/qwen-110B-chat/model/model.py +++ b/qwen/qwen-110B-chat/model/model.py @@ -61,5 +61,4 @@ async def generator(): full_text = text yield delta - return generator() - + return generator() \ No newline at end of file From ee5a83a8720f064de164b0b19a8a6b72d82b3ee6 Mon Sep 17 00:00:00 2001 From: ImmarKarim <87947114+ImmarKarim@users.noreply.github.com> Date: Sat, 11 May 2024 00:41:46 +0500 Subject: [PATCH 5/5] Fixed all issues --- qwen/qwen-110B-chat/config.yaml | 1 - qwen/qwen-110B-chat/model/model.py | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/qwen/qwen-110B-chat/config.yaml b/qwen/qwen-110B-chat/config.yaml index e1118cd18..ed7daef4c 100644 --- a/qwen/qwen-110B-chat/config.yaml +++ b/qwen/qwen-110B-chat/config.yaml @@ -1,4 +1,3 @@ -environment_variables: {CUDA_VISIBLE_DEVICES: "0,1,2,3"} external_package_dirs: [] model_metadata: example_model_input: {"prompt": "How long would it take to reach the sun?"} diff --git a/qwen/qwen-110B-chat/model/model.py b/qwen/qwen-110B-chat/model/model.py index f4efa7343..a58fcd077 100644 --- a/qwen/qwen-110B-chat/model/model.py +++ b/qwen/qwen-110B-chat/model/model.py @@ -61,4 +61,5 @@ async def generator(): full_text = text yield delta - return generator() \ No newline at end of file + return generator() +