Commit 600bdc1

Authored by HaiHui886 and haihwang
Add Qwen1.5-72B-chat (#44)
* Push image to OpenCSG registry
* Fix pad token bug
* Remove cpu_nums and add time log
* Add deep code model
* Update OpenCSG model name
* Update DeepSeek parameters
* Add Qwen1.5
* Add Qwen 1.5 72B

Co-authored-by: haihwang <[email protected]>
1 parent ada574d commit 600bdc1

File tree: 2 files changed (+51, -1)


llmserve/backend/server/config.py (2 additions, 1 deletion)

@@ -131,7 +131,8 @@
     "opencsg/opencsg-starcoder-v0.1": "./models/text-generation--opencsg--opencsg-starcoder-15B-v0.1-pipeline.yaml",
     "OpenCSG/opencsg-starcoder-v0.1": "./models/text-generation--opencsg--opencsg-starcoder-15B-v0.1-pipeline.yaml",
     "opencsg/opencsg-deepseek-coder-1.3b-v0.1": "./models/text-generation--opencsg--opencsg-deepseek-coder-1.3b-v0.1.yaml",
-    "OpenCSG/opencsg-deepseek-coder-1.3b-v0.1": "./models/text-generation--opencsg--opencsg-deepseek-coder-1.3b-v0.1.yaml"
+    "OpenCSG/opencsg-deepseek-coder-1.3b-v0.1": "./models/text-generation--opencsg--opencsg-deepseek-coder-1.3b-v0.1.yaml",
+    "Qwen/Qwen1.5-72B-Chat": "./models/text-generation--Qwen--Qwen1.5-72B-Chat.yaml"
 }

 SERVE_RUN_HOST = "0.0.0.0"
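The change above extends a model-id-to-YAML-path registry. As a minimal sketch of how such a mapping can be resolved, the snippet below copies two entries from the diff; `resolve_model_config` is a hypothetical helper for illustration, not part of the llmserve codebase.

```python
# Sketch of resolving a model id against a registry like the one in config.py.
# The dict mirrors two entries from this commit's diff; resolve_model_config
# is a hypothetical helper, not llmserve API.
MODEL_REGISTRY = {
    "OpenCSG/opencsg-deepseek-coder-1.3b-v0.1":
        "./models/text-generation--opencsg--opencsg-deepseek-coder-1.3b-v0.1.yaml",
    "Qwen/Qwen1.5-72B-Chat":
        "./models/text-generation--Qwen--Qwen1.5-72B-Chat.yaml",
}

def resolve_model_config(model_id: str) -> str:
    """Return the pipeline YAML path registered for model_id."""
    try:
        return MODEL_REGISTRY[model_id]
    except KeyError:
        raise ValueError(f"unknown model id: {model_id}") from None

print(resolve_model_config("Qwen/Qwen1.5-72B-Chat"))  # prints the registered YAML path
```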
models/text-generation--Qwen--Qwen1.5-72B-Chat.yaml (new file, 49 additions, 0 deletions)

deployment_config:
  autoscaling_config:
    min_replicas: 1
    initial_replicas: 1
    max_replicas: 1
    target_num_ongoing_requests_per_replica: 1.0
    metrics_interval_s: 10.0
    look_back_period_s: 30.0
    smoothing_factor: 1.0
    downscale_delay_s: 300.0
    upscale_delay_s: 90.0
  ray_actor_options:
    num_cpus: 2  # a model deployment creates 3 actors; the first two cost 0.1 CPU each, and model inference uses the num_cpus_per_worker set at the end of this file
model_config:
  warmup: False
  model_task: text-generation
  model_id: Qwen/Qwen1.5-72B-Chat
  max_input_words: 800
  initialization:
    s3_mirror_config:
      bucket_uri: /data/models/Qwen1.5-72B-Chat/
    initializer:
      type: DeviceMap
      dtype: float16
      from_pretrained_kwargs:
        use_cache: true
        trust_remote_code: true
      # use_kernel: true   # for the DeepSpeed initializer type only
      # max_tokens: 1536   # for the DeepSpeed initializer type only
    pipeline: defaulttransformers
    # pipeline: default
  generation:
    max_batch_size: 1
    generate_kwargs:
      bos_token_id: 151643
      # pad_token_id: 151643
      # eos_token_id: [151645, 151643]
      do_sample: false
      max_new_tokens: 512
      repetition_penalty: 1.05
      temperature: 0.7
      top_p: 0.8
      top_k: 20
    prompt_format: "'role': 'user', 'content': {instruction}"
    # stopping_sequences: ["### Response:", "### End"]
scaling_config:
  num_workers: 1
  num_gpus_per_worker: 7
  num_cpus_per_worker: 32  # for inference
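The `generate_kwargs` block above is ultimately passed to the model's generation call, so malformed values (for example a YAML trailing comma turning `151643,` into a string) fail only at inference time. The sketch below shows a hypothetical up-front sanity check; `validate_kwargs` and the value ranges are illustrative assumptions, not llmserve API.

```python
# Hypothetical sanity check for a generate_kwargs block like the one above.
# validate_kwargs is an illustrative helper, not part of llmserve.
GENERATE_KWARGS = {
    "bos_token_id": 151643,
    "do_sample": False,
    "max_new_tokens": 512,
    "repetition_penalty": 1.05,
    "temperature": 0.7,
    "top_p": 0.8,
    "top_k": 20,
}

def validate_kwargs(kw: dict) -> list:
    """Return a list of problems found in a generation-kwargs dict."""
    problems = []
    if not 0.0 < kw.get("temperature", 1.0) <= 2.0:
        problems.append("temperature out of (0, 2]")
    if not 0.0 < kw.get("top_p", 1.0) <= 1.0:
        problems.append("top_p out of (0, 1]")
    if kw.get("max_new_tokens", 1) < 1:
        problems.append("max_new_tokens must be >= 1")
    if not isinstance(kw.get("bos_token_id", 0), int):
        # a stray trailing comma in YAML makes this a string, not an int
        problems.append("bos_token_id must be an int")
    return problems

assert validate_kwargs(GENERATE_KWARGS) == []  # the config above passes
```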
