From 1bcdbbbd6146eacaee78fa64bc209db3a06eb343 Mon Sep 17 00:00:00 2001
From: ImmarKarim <87947114+ImmarKarim@users.noreply.github.com>
Date: Wed, 1 May 2024 22:29:33 +0500
Subject: [PATCH 1/5] Add truss example for Qwen1.5-110B with vllm & streaming
 support

---
 qwen/qwen-110B-chat/config.yaml       | 19 ++++++++
 qwen/qwen-110B-chat/model/__init__.py |  0
 qwen/qwen-110B-chat/model/model.py    | 64 +++++++++++++++++++++++++++
 3 files changed, 83 insertions(+)
 create mode 100644 qwen/qwen-110B-chat/config.yaml
 create mode 100644 qwen/qwen-110B-chat/model/__init__.py
 create mode 100644 qwen/qwen-110B-chat/model/model.py

diff --git a/qwen/qwen-110B-chat/config.yaml b/qwen/qwen-110B-chat/config.yaml
new file mode 100644
index 000000000..81d87f478
--- /dev/null
+++ b/qwen/qwen-110B-chat/config.yaml
@@ -0,0 +1,19 @@
+environment_variables: {CUDA_VISIBLE_DEVICES: "0,1,2,3"}
+external_package_dirs: []
+model_metadata:
+  example_model_input: {"prompt": "How long would it take to reach the sun?"}
+model_name: Qwen1.5-vllm-streaming
+python_version: py310
+requirements:
+- torch==2.1.2
+- transformers==4.37.0
+- vllm
+- asyncio==3.4.3
+- ray
+resources:
+  accelerator: A100
+  cpu: '40'
+  memory: 100Gi
+  use_gpu: true
+secrets: {}
+system_packages: []
diff --git a/qwen/qwen-110B-chat/model/__init__.py b/qwen/qwen-110B-chat/model/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/qwen/qwen-110B-chat/model/model.py b/qwen/qwen-110B-chat/model/model.py
new file mode 100644
index 000000000..f4efa7343
--- /dev/null
+++ b/qwen/qwen-110B-chat/model/model.py
@@ -0,0 +1,64 @@
+import subprocess
+import uuid
+from transformers import AutoTokenizer
+
+from vllm import SamplingParams
+from vllm.engine.arg_utils import AsyncEngineArgs
+from vllm.engine.async_llm_engine import AsyncLLMEngine
+
+
+class Model:  # Serves a chat model through vLLM's async engine and streams the generated text.
+    def __init__(self, model_name="Qwen/Qwen1.5-110B-Chat"):  # model_name is given to both the engine args and the tokenizer in load()
+        self.model_name = model_name
+        self.tokenizer = None  # populated in load()
+        self.sampling_params = None  # populated in load()
+
+        command = "ray start --head"  # side effect of construction: launches a Ray head process
+        subprocess.check_output(command, shell=True, text=True)  # output is discarded; raises CalledProcessError on a non-zero exit
+
+    def load(self):  # Builds the engine args, tokenizer, sampling params and the async engine; call before predict()
+        self.model_args = AsyncEngineArgs(
+            model=self.model_name,
+            dtype='auto',
+            enforce_eager=True,
+            tensor_parallel_size=4  # presumably matches the 4 GPUs requested in config.yaml -- TODO confirm and keep in sync
+
+        )
+
+        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+
+        self.sampling_params = SamplingParams(    # One fixed set of values, reused for every predict() call
+            temperature=0.7,
+            top_p=0.8,
+            repetition_penalty=1.05,
+            max_tokens=512
+        )
+
+        self.llm_engine = AsyncLLMEngine.from_engine_args(self.model_args)
+
+    async def predict(self, model_input):  # Returns an async generator yielding text chunks for model_input["prompt"]
+        message = model_input.pop("prompt")  # removes "prompt" from the caller's dict; KeyError if it is missing
+
+        prompt = [
+            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "user", "content": message}
+        ]
+
+        text = self.tokenizer.apply_chat_template(  # tokenize=False below, so the result is passed on as prompt text
+            prompt,
+            tokenize=False,
+            add_generation_prompt=True
+        )
+
+        idx = str(uuid.uuid4().hex)  # fresh random id per call, passed to the engine as the third argument
+        vllm_generator = self.llm_engine.generate(text, self.sampling_params, idx)
+
+        async def generator():  # re-emits each engine output as only the text not yielded before
+            full_text = ""  # text seen on the previous iteration
+            async for output in vllm_generator:
+                text = output.outputs[0].text  # local to generator(); treated as the whole text produced so far
+                delta = text[len(full_text) :]  # suffix beyond what was already yielded
+                full_text = text
+                yield delta
+
+        return generator()
\ No newline at end of file

From ecc007098911bc86d935a25ebb8ce7d4c237b889 Mon Sep 17 00:00:00 2001
From: ImmarKarim <87947114+ImmarKarim@users.noreply.github.com>
Date: Fri, 3 May 2024 13:15:55 +0500
Subject: [PATCH 2/5] Changed the resources section - PR feedback

---
 qwen/qwen-110B-chat/config.yaml | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/qwen/qwen-110B-chat/config.yaml b/qwen/qwen-110B-chat/config.yaml
index 81d87f478..e1118cd18 100644
--- a/qwen/qwen-110B-chat/config.yaml
+++ b/qwen/qwen-110B-chat/config.yaml
@@ -5,15 +5,13 @@ model_metadata:
 model_name: Qwen1.5-vllm-streaming
 python_version: py310
 requirements:
-- torch==2.1.2
-- transformers==4.37.0
-- vllm
+- torch==2.2.1
+- transformers==4.40.0
+- vllm==0.4.1
 - asyncio==3.4.3
 - ray
 resources:
-  accelerator: A100
-  cpu: '40'
-  memory: 100Gi
+  accelerator: A100:4
   use_gpu: true
 secrets: {}
 system_packages: []

From 5c90be001818070de2ea7e395e8491fa55c43545 Mon Sep 17 00:00:00 2001
From: ImmarKarim <87947114+ImmarKarim@users.noreply.github.com>
Date: Sat, 11 May 2024 00:29:26 +0500
Subject: [PATCH 3/5] Removed env from config

---
 qwen/qwen-110B-chat/config.yaml    | 3 +--
 qwen/qwen-110B-chat/model/model.py | 3 ++-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/qwen/qwen-110B-chat/config.yaml b/qwen/qwen-110B-chat/config.yaml
index e1118cd18..9f75ae845 100644
--- a/qwen/qwen-110B-chat/config.yaml
+++ b/qwen/qwen-110B-chat/config.yaml
@@ -1,8 +1,7 @@
-environment_variables: {CUDA_VISIBLE_DEVICES: "0,1,2,3"}
 external_package_dirs: []
 model_metadata:
   example_model_input: {"prompt": "How long would it take to reach the sun?"}
-model_name: Qwen1.5-vllm-streaming
+model_name: Qwen1.5-vllm-streaminggg
 python_version: py310
 requirements:
 - torch==2.2.1
diff --git a/qwen/qwen-110B-chat/model/model.py b/qwen/qwen-110B-chat/model/model.py
index f4efa7343..32fd14383 100644
--- a/qwen/qwen-110B-chat/model/model.py
+++ b/qwen/qwen-110B-chat/model/model.py
@@ -61,4 +61,5 @@ async def generator():
                 full_text = text
                 yield delta
 
-        return generator()
\ No newline at end of file
+        return generator()
+    

From ede8d24e3ae90492cce6d87035e22effdb6a4bb5 Mon Sep 17 00:00:00 2001
From: ImmarKarim <87947114+ImmarKarim@users.noreply.github.com>
Date: Sat, 11 May 2024 00:34:00 +0500
Subject: [PATCH 4/5] Revert "Removed env from config"

This reverts commit 5c90be001818070de2ea7e395e8491fa55c43545.
---
 qwen/qwen-110B-chat/config.yaml    | 3 ++-
 qwen/qwen-110B-chat/model/model.py | 3 +--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/qwen/qwen-110B-chat/config.yaml b/qwen/qwen-110B-chat/config.yaml
index 9f75ae845..e1118cd18 100644
--- a/qwen/qwen-110B-chat/config.yaml
+++ b/qwen/qwen-110B-chat/config.yaml
@@ -1,7 +1,8 @@
+environment_variables: {CUDA_VISIBLE_DEVICES: "0,1,2,3"}
 external_package_dirs: []
 model_metadata:
   example_model_input: {"prompt": "How long would it take to reach the sun?"}
-model_name: Qwen1.5-vllm-streaminggg
+model_name: Qwen1.5-vllm-streaming
 python_version: py310
 requirements:
 - torch==2.2.1
diff --git a/qwen/qwen-110B-chat/model/model.py b/qwen/qwen-110B-chat/model/model.py
index 32fd14383..f4efa7343 100644
--- a/qwen/qwen-110B-chat/model/model.py
+++ b/qwen/qwen-110B-chat/model/model.py
@@ -61,5 +61,4 @@ async def generator():
                 full_text = text
                 yield delta
 
-        return generator()
-    
+        return generator()
\ No newline at end of file

From ee5a83a8720f064de164b0b19a8a6b72d82b3ee6 Mon Sep 17 00:00:00 2001
From: ImmarKarim <87947114+ImmarKarim@users.noreply.github.com>
Date: Sat, 11 May 2024 00:41:46 +0500
Subject: [PATCH 5/5] Remove CUDA_VISIBLE_DEVICES env var; end model.py with a newline

---
 qwen/qwen-110B-chat/config.yaml    | 1 -
 qwen/qwen-110B-chat/model/model.py | 3 ++-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/qwen/qwen-110B-chat/config.yaml b/qwen/qwen-110B-chat/config.yaml
index e1118cd18..ed7daef4c 100644
--- a/qwen/qwen-110B-chat/config.yaml
+++ b/qwen/qwen-110B-chat/config.yaml
@@ -1,4 +1,3 @@
-environment_variables: {CUDA_VISIBLE_DEVICES: "0,1,2,3"}
 external_package_dirs: []
 model_metadata:
   example_model_input: {"prompt": "How long would it take to reach the sun?"}
diff --git a/qwen/qwen-110B-chat/model/model.py b/qwen/qwen-110B-chat/model/model.py
index f4efa7343..a58fcd077 100644
--- a/qwen/qwen-110B-chat/model/model.py
+++ b/qwen/qwen-110B-chat/model/model.py
@@ -61,4 +61,5 @@ async def generator():
                 full_text = text
                 yield delta
 
-        return generator()
\ No newline at end of file
+        return generator()
+