Commit db54d53

add Qwen1.5-72B-GGUF yaml and fix load json input error (#71)
* add Qwen1.5-72B-GGUF yaml and fix load json input error
* add logs
* check prompt format
* change function name of llamacpp
* check prompt format
* update parameters
* update comments
* update
* update doc
* update log
* update
* add Qwen1.5-72B-GGUF
* remove echo
* add echo
* comment log
* remove echo
* fix stream
1 parent a956b41 commit db54d53

7 files changed (+121, -14 lines)

docs/common_issues.md

Lines changed: 10 additions & 0 deletions
@@ -4,6 +4,16 @@
 
 Transformer version should be 4.33.3.
 
+## Use Llamacpp with GPU
+
+By default `llama-cpp-python` is installed without GPU support.
+
+Refer to the [llama-cpp-python](https://github.com/abetlen/llama-cpp-python) repo to reinstall the `llama-cpp-python` package with GPU support.
+
+Links for GPU:
+- https://github.com/abetlen/llama-cpp-python/issues/509#issuecomment-1739098588
+- https://github.com/abetlen/llama-cpp-python/issues/627#issuecomment-1722495987
+
 ## Launch model by Ray Job API
 
 ```
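A minimal sketch for confirming that a reinstalled `llama-cpp-python` build actually offloads layers to the GPU; the GGUF path is a placeholder, not something from this commit:

```python
# Load any local GGUF with all layers offloaded; with verbose=True the
# llama.cpp load log reports how many layers were placed on the GPU.
from llama_cpp import Llama

llm = Llama(
    model_path="/path/to/model.gguf",  # placeholder path
    n_gpu_layers=-1,                   # -1 = offload all layers, as the initializer in this commit does on GPU
    verbose=True,
)

out = llm("Hello", max_tokens=8)
print(out["choices"][0]["text"])
```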

llmserve/backend/llm/engines/generic.py

Lines changed: 3 additions & 3 deletions
@@ -120,7 +120,7 @@ def init_model(
         logger.info("start to test with single prompt")
         logger.info(f"warmpup prompt is: {warmup_inputs}")
         resp = generate(
-            [Prompt(prompt=warmup_inputs, use_prompt_format=False)],
+            [Prompt(prompt=warmup_inputs, use_prompt_format=True)],
             pipeline,
             **generate_kwargs,
         )
@@ -287,7 +287,7 @@ def ping(self) -> bool:
     async def worker_stream_generate_texts(self, prompt: str, **kwargs) -> Generator[str, None, None]:  # type: ignore
         logger.info(f"Call PredictionWorker.worker_stream_generate_texts with kwargs: {kwargs}")
         for s in self.generator.streamGenerate(prompt, **kwargs):
-            logger.info(f"PredictionWorker.worker_stream_generate_texts -> yield ->{s}")
+            # logger.info(f"PredictionWorker.worker_stream_generate_texts -> yield ->{s}")
             yield s
 
 class GenericEngine(LLMEngine):
@@ -375,7 +375,7 @@ def slice_prompts(worker_num: int, worker_index: int, prompts: list[str]):
         else:
             return prompts[slice_size * worker_index: slice_size * worker_index + slice_size]
 
-        logger.info('LLM Predictor do async predict')
+        logger.info('LLM GenericEngine do async predict')
 
         async with lock:
             # prediction = (
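For context, the `worker_stream_generate_texts` change above relays a synchronous token generator through an async generator. A standalone sketch of that pattern, with illustrative names that are not from the repo:

```python
import asyncio
from typing import AsyncGenerator, Generator


def fake_stream(prompt: str) -> Generator[str, None, None]:
    # Stand-in for a streaming pipeline such as LlamaCppPipeline.streamGenerate:
    # yields text chunks one at a time.
    for chunk in (prompt, " ...", " done"):
        yield chunk


async def relay(prompt: str) -> AsyncGenerator[str, None]:
    # Same shape as worker_stream_generate_texts: iterate the sync generator
    # and re-yield each chunk to async consumers.
    for s in fake_stream(prompt):
        yield s


async def main() -> None:
    async for s in relay("hello"):
        print(s, end="", flush=True)
    print()


asyncio.run(main())
```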

llmserve/backend/llm/initializers/llamacpp.py

Lines changed: 3 additions & 2 deletions
@@ -65,6 +65,7 @@ def __init__(
         self.model_init_kwargs = model_init_kwargs
 
     def _get_model_init_kwargs(self) -> Dict[str, Any]:
+        logger.info(f"model_init_kwargs: {self.model_init_kwargs}")
         return {
             # -1 means all layers are offloaded to GPU
             "n_gpu_layers": 0 if self.device.type == "cpu" else -1,
@@ -75,10 +76,10 @@ def _get_model_init_kwargs(self) -> Dict[str, Any]:
         }
 
     def load_model(self, model_id: str) -> "Llama":
-        logger.info(
-            f"LlamaCppInitializer downloading {model_id} : {self.model_filename}")
+        logger.info(f"LlamaCppInitializer downloading {model_id} : {self.model_filename}")
         model_path = hf_hub_download(model_id, self.model_filename)
         logger.info(f"LlamaCppInitializer Loading model {model_path}")
+        logger.info(f"model_init_kwargs: {self._get_model_init_kwargs()}")
         # Lazy import to avoid issues on CPU head node
         from llama_cpp import Llama

llmserve/backend/llm/pipelines/llamacpp/llamacpp_pipeline.py

Lines changed: 59 additions & 7 deletions
@@ -11,6 +11,12 @@
 from ..utils import decode_stopping_sequences_where_needed, construct_prompts
 import json
 
+from typing import Generator
+from transformers import TextIteratorStreamer
+from threading import Thread
+from queue import Empty
+import asyncio
+
 if TYPE_CHECKING:
     from llama_cpp import Llama, LogitsProcessorList, StoppingCriteriaList
 
@@ -100,11 +106,10 @@ def _add_default_generate_kwargs(
         return generate_kwargs
 
     def __call__(self, inputs: List[str], **kwargs) -> List[Response]:
-        logger.info(inputs)
-        inputs = construct_prompts(
-            inputs, prompt_format=self.prompt_format)
-
-        logger.info(inputs)
+        logger.info(f"prompt_format: {self.prompt_format}")
+        logger.info(f"before construct_prompts: {inputs}")
+        inputs = construct_prompts(inputs, prompt_format=self.prompt_format)
+        logger.info(f"after construct_prompts: {inputs}")
 
         tokenized_inputs = self.tokenizer.encode(inputs)
         kwargs = self._add_default_generate_kwargs(
@@ -116,9 +121,10 @@ def __call__(self, inputs: List[str], **kwargs) -> List[Response]:
         chat_completion = False
         try:
             inputs_bak = inputs
-            inputs = [json.loads(prompt) for prompt in inputs]
+            inputs = [json.loads(prompt, strict=False) for prompt in inputs]
             chat_completion = True
-        except:
+        except Exception as ex:
+            logger.error(f"Exception apply_chat_template: {ex}")
             logger.info("Seems no chat template from user")
             inputs = inputs_bak
 
@@ -218,3 +224,49 @@ def from_initializer(
             device=device,
             **kwargs,
         )
+
+    def streamGenerate(self, prompt: str, **generate_kwargs) -> Generator[str, None, None]:
+        logger.info(f"stream prompt: {prompt}")
+        inputs = construct_prompts(prompt, prompt_format=self.prompt_format)
+        logger.info(f"stream inputs: {inputs}")
+        chat_completion = False
+        try:
+            inputs_bak = inputs
+            inputs = [json.loads(prompt, strict=False) for prompt in inputs]
+            chat_completion = True
+        except Exception as ex:
+            logger.error(f"Exception apply_chat_template: {ex}")
+            logger.info("Seems no chat template from user")
+            inputs = inputs_bak
+
+        logger.info(f"stream generate_kwargs: {generate_kwargs}")
+        logger.info(f"model inputs: {inputs}")
+
+        if chat_completion:
+            generate_kwargs.pop('stopping_sequences', None)
+            logger.info(f"chat generate_kwargs: {generate_kwargs}")
+            output = self.model.create_chat_completion(messages=inputs[0], stream=True, **generate_kwargs)
+            for chunk in output:
+                logger.info(f'LlamaCppPipeline -> create_chat_completion -> Yield -> "{chunk}" -> "{type(chunk)}"')
+                delta = chunk['choices'][0]['delta']
+                val = ''
+                if 'role' in delta:
+                    val = ''
+                elif 'content' in delta:
+                    val = delta['content']
+                yield val
+        else:
+            input_ids = self.model.tokenizer(inputs)
+            output = self.model.generate(tokens=input_ids, **generate_kwargs)
+            for token in output:
+                val = self.model.detokenize([token])
+                logger.info(f'LlamaCppPipeline -> generate -> Yield -> "{val}" -> "{type(val)}"')
+                yield val
+
+        # streaming sample for test
+        # start = 0
+        # while True:
+        #     val = prompt + str(start)
+        #     logger.info(f"LlamaCppPipeline.streamGenerate -> yield -> {val}")
+        #     yield val
+        #     start += 1
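The delta handling in `streamGenerate` above follows `llama-cpp-python`'s OpenAI-style streaming chunks: the first delta carries only `role`, later deltas carry incremental `content`. A standalone sketch of the same loop outside the pipeline; the GGUF path is a placeholder:

```python
from llama_cpp import Llama

# Placeholder path; any chat-tuned GGUF works for this sketch.
llm = Llama(model_path="/path/to/qwen1_5-72b-chat-q5_k_m.gguf", n_gpu_layers=-1)

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Say hi."},
]

# stream=True yields OpenAI-style chunks instead of one final dict.
for chunk in llm.create_chat_completion(messages=messages, stream=True, max_tokens=64):
    delta = chunk["choices"][0]["delta"]
    if "content" in delta:
        print(delta["content"], end="", flush=True)
print()
```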

llmserve/backend/server/app.py

Lines changed: 4 additions & 1 deletion
@@ -411,7 +411,10 @@ async def query(self, *args) -> Dict[str, Dict[str, Any]]:
         else:
             prompts = args[0]
         logger.info(f"ExperimentalDeployment query.prompts {prompts}")
-        results = await asyncio.gather(*[(self._model.generate_text.remote(Prompt(prompt=prompts, use_prompt_format=False)))])
+        use_prompt_format = False
+        if self._model_configuration.model_config.generation.prompt_format:
+            use_prompt_format = True
+        results = await asyncio.gather(*[(self._model.generate_text.remote(Prompt(prompt=prompts, use_prompt_format=use_prompt_format)))])
         logger.info(f"ExperimentalDeployment query.results {results}")
         results = results[0]
         return results.generated_text

llmserve/backend/server/config.py

Lines changed: 2 additions & 1 deletion
@@ -132,7 +132,8 @@
     "OpenCSG/opencsg-starcoder-v0.1": "./models/text-generation--opencsg--opencsg-starcoder-15B-v0.1-pipeline.yaml",
     "opencsg/opencsg-deepseek-coder-1.3b-v0.1": "./models/text-generation--opencsg--opencsg-deepseek-coder-1.3b-v0.1.yaml",
     "OpenCSG/opencsg-deepseek-coder-1.3b-v0.1": "./models/text-generation--opencsg--opencsg-deepseek-coder-1.3b-v0.1.yaml",
-    "Qwen/Qwen1.5-72B-Chat": "./models/text-generation--Qwen--Qwen1.5-72B-Chat.yaml"
+    "Qwen/Qwen1.5-72B-Chat": "./models/text-generation--Qwen--Qwen1.5-72B-Chat.yaml",
+    "Qwen/Qwen1.5-72B-Chat-GGUF": "./models/text-generation--Qwen1.5-72B-Chat-GGUF.yaml"
 }
 
 SERVE_RUN_HOST = "0.0.0.0"
models/text-generation--Qwen1.5-72B-Chat-GGUF.yaml

Lines changed: 40 additions & 0 deletions

@@ -0,0 +1,40 @@
+deployment_config:
+  autoscaling_config:
+    min_replicas: 1
+    initial_replicas: 1
+    max_replicas: 8
+    target_num_ongoing_requests_per_replica: 1.0
+    metrics_interval_s: 10.0
+    look_back_period_s: 30.0
+    smoothing_factor: 1.0
+    downscale_delay_s: 300.0
+    upscale_delay_s: 90.0
+  ray_actor_options:
+    num_cpus: 2  # for a model deployment, three actors are created; actors 1 and 2 each cost 0.1 CPU, and model inference costs 6 (see the settings at the end of this file)
+model_config:
+  # stream: True
+  warmup: True
+  model_task: text-generation
+  model_id: Qwen/Qwen1.5-72B-Chat-GGUF
+  max_input_words: 512
+  initialization:
+    # s3_mirror_config:
+    #   bucket_uri: /data/models/Qwen1.5-72B-Chat-GGUF/
+    initializer:
+      type: LlamaCpp
+      model_filename: qwen1_5-72b-chat-q5_k_m.gguf
+      model_init_kwargs:
+        test: true
+        n_gpu_layers: -1
+    pipeline: llamacpp
+  generation:
+    max_batch_size: 1
+    batch_wait_timeout_s: 0
+    generate_kwargs:
+      max_tokens: 512
+    prompt_format: '[{{"role": "system", "content": "You are a helpful assistant."}},{{"role": "user", "content": "{instruction}"}}]'
+    stopping_sequences: ["<|im_end|>"]
+scaling_config:
+  num_workers: 1
+  num_gpus_per_worker: 6
+  num_cpus_per_worker: 8  # for inference
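The `prompt_format` above is what `LlamaCppPipeline` later parses with `json.loads(..., strict=False)` to decide whether to call `create_chat_completion`. A small sketch of how an instruction could expand into that chat-message list, assuming the template is filled with Python's `str.format` (the doubled braces are literal braces under that assumption):

```python
import json

# prompt_format copied from the YAML above.
prompt_format = '[{{"role": "system", "content": "You are a helpful assistant."}},{{"role": "user", "content": "{instruction}"}}]'

# Assumed expansion step: fill {instruction} with the user's text.
prompt = prompt_format.format(instruction="What is llama.cpp?")

# strict=False mirrors the pipeline change in this commit: it tolerates
# control characters (e.g. newlines) inside the instruction string.
messages = json.loads(prompt, strict=False)
print(messages[1]["content"])  # -> What is llama.cpp?
```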
