You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
num_cpus: 2  # For a model deployment, three actors are created: actors 1 and 2 each cost 0.1 CPU, and the model inference costs 6 (see the settings at the end of the file).
14
+
# Model definition: which model to load, how to initialise it, and how
# generation requests are batched and formatted.
# NOTE(review): the nesting below was reconstructed from a flattened copy of
# this file; confirm the hierarchy against the consuming schema.
model_config:
  # stream: True
  warmup: true
  model_task: text-generation
  model_id: Qwen/Qwen1.5-72B-Chat-GGUF
  max_input_words: 512
  initialization:
    # Optional local/S3 mirror of the model files (currently disabled).
    # s3_mirror_config:
    #   bucket_uri: /data/models/Qwen1.5-72B-Chat-GGUF/
    initializer:
      type: LlamaCpp
      model_filename: qwen1_5-72b-chat-q5_k_m.gguf
      model_init_kwargs:
        # NOTE(review): purpose of `test` is not evident from this file;
        # confirm the LlamaCpp initializer accepts it.
        test: true
        # Presumably forwarded to llama.cpp, where -1 conventionally means
        # "offload all layers to the GPU" -- TODO confirm.
        n_gpu_layers: -1
    pipeline: llamacpp
  generation:
    max_batch_size: 1
    batch_wait_timeout_s: 0
    generate_kwargs:
      max_tokens: 512
    # Chat template as a JSON-style message list. The doubled braces look like
    # escaped literal braces around a single `{instruction}` placeholder --
    # verify against the code that formats the prompt.
    prompt_format: '[{{"role": "system", "content": "You are a helpful assistant."}},{{"role": "user", "content": "{instruction}"}}]'
0 commit comments