Commit a636a52

enable chat template for huggingface transformer (#54)
1 parent a0c34a3 commit a636a52

8 files changed: +75 −34 lines

llmserve/backend/llm/pipelines/default_pipeline.py
Lines changed: 11 additions & 0 deletions

@@ -6,6 +6,7 @@
 
 from llmserve.backend.logger import get_logger
 from llmserve.backend.server.models import Response
+import json
 
 from ._base import BasePipeline
 from .processors import StopOnTokens
@@ -54,6 +55,16 @@ def preprocess(self, prompts: List[str], **generate_kwargs):
         if self.tokenizer.pad_token is None:
             self.tokenizer.pad_token = self.tokenizer.eos_token
 
+        try:
+            prompt_text_bak = prompt_text
+            prompt_text = [json.loads(prompt) for prompt in prompt_text]
+            prompt_text = [self.tokenizer.apply_chat_template(prompt_obj, tokenize=False, add_generation_prompt=True) for prompt_obj in prompt_text]
+        except:
+            logger.info("Prompt is not chat-format JSON, or the model does not define a chat template")
+            prompt_text = prompt_text_bak
+
+        logger.info(f"Call model.generate with input: {prompt_text}")
+
         inputs = self.tokenizer(
             prompt_text, return_tensors="pt", add_special_tokens = generate_kwargs.get("add_special_tokens", True), padding=True
         ).to(self.model.device if hasattr(self.model, 'device') else self.device)
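Note: the new preprocess block falls back to the raw prompt whenever the input is not a JSON message list or the tokenizer has no chat template. A minimal standalone sketch of that logic (model id and prompt are illustrative, not taken from this repo):

import json
from transformers import AutoTokenizer

# Any chat model whose tokenizer ships a chat template works here.
tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")

raw_prompt = '[{"role": "user", "content": "What is Ray Serve?"}]'

try:
    # Chat-format prompt: parse the JSON message list and render it with the
    # model's chat template into a single generation-ready string.
    messages = json.loads(raw_prompt)
    prompt_text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
except Exception:
    # Plain-text prompt, invalid JSON, or no chat template: keep the raw string.
    prompt_text = raw_prompt

print(prompt_text)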

llmserve/backend/llm/pipelines/default_transformers_pipeline.py
Lines changed: 13 additions & 4 deletions

@@ -2,14 +2,15 @@
 
 import torch
 import time
+import json
 from transformers import Pipeline as TransformersPipeline
 from transformers import PreTrainedModel, PreTrainedTokenizer, pipeline
 
 from llmserve.backend.logger import get_logger
 from llmserve.backend.server.models import Prompt, Response
 
 from ._base import BasePipeline
-from .utils import construct_prompts_experimental, truncate_to_first_stop_token
+from .utils import construct_prompts
 from llmserve.backend.server.utils import render_gradio_params
 from .default_pipeline import DefaultPipeline
 
@@ -135,12 +136,20 @@ def preprocess(self, prompts: List[str], **generate_kwargs):
         st = time.monotonic()
         inputs = None
         logger.info(f"input from pipeline: ****** {prompts}")
-        prompt_text = construct_prompts_experimental(
+        prompt_text = construct_prompts(
             prompts, prompt_format=self.prompt_format)
-        instruction_text = construct_prompts_experimental(prompts, prompt_format="")
+        instruction_text = construct_prompts(prompts, prompt_format="")
         logger.info(f"input from pipeline: ****** {prompt_text}")
 
         if isinstance(self.pipeline, transformers.pipelines.text_generation.TextGenerationPipeline):
+            try:
+                prompt_text_bak = prompt_text
+                prompt_text = [json.loads(prompt) for prompt in prompt_text]
+                prompt_text = [self.tokenizer.apply_chat_template(prompt_obj, tokenize=False, add_generation_prompt=True) for prompt_obj in prompt_text]
+            except:
+                logger.info("Prompt is not chat-format JSON, or the model does not define a chat template")
+                prompt_text = prompt_text_bak
+
             inputs = self.tokenizer(
                 prompt_text, return_tensors="pt", add_special_tokens = generate_kwargs.get("add_special_tokens", True), padding=True
             )
@@ -224,7 +233,7 @@ def postprocess(self, model_outputs, **postprocess_kwargs) -> List[Response]:
                 output).input_ids)
             num_input_tokens = len(self.tokenizer(inputs[index]))
             response = Response(
-                generated_text=output,
+                generated_text=output[len(inputs[index]):],
                 num_generated_tokens=num_generated_tokens,
                 num_input_tokens=num_input_tokens,
             )
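Note: the postprocess change accounts for Hugging Face text-generation pipelines echoing the prompt at the start of `generated_text`; slicing off the first `len(inputs[index])` characters keeps only the completion. A tiny illustration with made-up strings:

# Illustrative values only; in the pipeline, `inputs[index]` is the rendered
# prompt string and `output` is the pipeline's generated_text, which echoes it.
prompt = "Ahoy matey, what be Ray Serve?"
output = "Ahoy matey, what be Ray Serve? Arr, it be a framework for servin' models."

completion = output[len(prompt):]
print(completion)  # " Arr, it be a framework for servin' models."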

llmserve/backend/llm/pipelines/utils.py
Lines changed: 0 additions & 19 deletions

@@ -69,25 +69,6 @@ def construct_prompts(
         prompts = [prompts]
     return [_construct_prompt(prompt, prompt_format) for prompt in prompts]
 
-
-def construct_prompts_experimental(
-    prompts: Union[str, Prompt, List[str], List[Prompt], Tuple[str]],
-    prompt_format: str,
-) -> List[str]:
-    """Construct prompts from a prompt string or list of prompts."""
-    if not isinstance(prompts, list):
-        prompts = [prompts]
-
-    params = []
-    for prompt in prompts:
-        if isinstance(prompt, Prompt) and isinstance(prompt.prompt, Tuple):
-            params += [_construct_prompt(prompt, prompt_format)
-                       for prompt in prompt.prompt]
-        else:
-            params.append(_construct_prompt(prompt, prompt_format))
-    return params
-
-
 def tokenize_stopping_sequences_where_needed(
     tokenizer: PreTrainedTokenizer,
     stopping_sequences: List[Union[str, int, List[int]]],

llmserve/backend/llm/predictor.py
Lines changed: 1 addition & 1 deletion

@@ -113,7 +113,7 @@ def init_model(
     logger.info("start to test with single prompt")
     logger.info(f"warmpup prompt is: {warmup_inputs}")
     resp = generate(
-        [warmup_inputs],
+        [Prompt(prompt=warmup_inputs, use_prompt_format=False)],
         pipeline,
         **generate_kwargs,
     )
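Note: wrapping the warmup string in `Prompt(..., use_prompt_format=False)` presumably lets the literal warmup text reach the pipeline instead of being pushed through the model's `prompt_format` (which is now a JSON chat template for some configs). A small sketch of the same call, using the warmup text defined in server/utils.py:

from llmserve.backend.server.models import Prompt

warmup_inputs = "Write a short story."  # the "warmup" value from render_gradio_params
warmup_prompt = Prompt(prompt=warmup_inputs, use_prompt_format=False)
# init_model then calls: generate([warmup_prompt], pipeline, **generate_kwargs)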

llmserve/backend/llm/utils.py
Lines changed: 0 additions & 8 deletions

@@ -279,14 +279,6 @@ async def init_torch_dist_process_group_async(
         node_id = node_and_gpu_ids[rank][0]
         local_rank = node_to_workers[node_id].index(rank)
         local_world_size = len(node_to_workers[node_id])
-        logger.info("++++++++++++++")
-        logger.info(rank)
-        logger.info(world_size)
-        logger.info(local_rank)
-        logger.info(local_world_size)
-        logger.info(master_addr)
-        logger.info(master_port)
-        logger.info(list(node_to_gpu_ids[node_id]))
         setup_futures.append(
             worker.execute.remote(
                 _init_torch_distributed,

llmserve/backend/server/utils.py
Lines changed: 1 addition & 1 deletion

@@ -175,7 +175,7 @@ def render_gradio_params(hg_task: str) -> Dict[str, Any]:
         pipeline_info = {
             "inputs": gr.components.Textbox(label="Input"),
             "outputs": gr.components.Textbox(label="Output"),
-            "preprocess": lambda x: {"text_inputs": [(text + "\n") for text in x]},
+            "preprocess": lambda x: {"text_inputs": [text for text in x]},
             # "postprocess": lambda r: r[0]["generated_text"],
             "postprocess": lambda r: [text[0]['generated_text'] for text in r],
             "warmup": "Write a short story."

New file (path not shown): model serving config YAML
Lines changed: 48 additions & 0 deletions

@@ -0,0 +1,48 @@
+deployment_config:
+  autoscaling_config:
+    min_replicas: 1
+    initial_replicas: 1
+    max_replicas: 8
+    target_num_ongoing_requests_per_replica: 1.0
+    metrics_interval_s: 10.0
+    look_back_period_s: 30.0
+    smoothing_factor: 1.0
+    downscale_delay_s: 300.0
+    upscale_delay_s: 90.0
+  ray_actor_options:
+    num_cpus: 0.1 # a model deployment creates 3 actors; actors 1 and 2 each cost 0.1 CPU, and model inference costs 6 (see the setting at the end of the file)
+model_config:
+  warmup: True
+  model_task: text-generation
+  model_id: facebook/blenderbot-400M-distill
+  max_input_words: 48
+  initialization:
+    # s3_mirror_config:
+    #   endpoint_url: http://39.107.108.170:9000 # Optional: custom S3 storage endpoint URL
+    #   bucket_uri: s3://opt-125m/facemodel/ # Must include a hash file with the commit id in the repo
+    #   bucket_uri: /tmp/hub/opt-125m/ # Local path of the model with a hash file
+    #   git_uri: https://portal.opencsg.com/models/opt-125m.git # git address for git clone
+    initializer:
+      type: SingleDevice
+      dtype: float32
+      from_pretrained_kwargs:
+        use_cache: true
+        trust_remote_code: true
+    pipeline: default
+  generation:
+    max_batch_size: 1
+    batch_wait_timeout_s: 0
+    generate_kwargs:
+      do_sample: true
+      max_new_tokens: 24
+      min_new_tokens: 16
+      temperature: 0.7
+      repetition_penalty: 1.1
+      top_p: 0.8
+      top_k: 50
+    prompt_format: '[{{"role": "system", "content": "You are a friendly chatbot who always responds in the style of a pirate"}},{{"role": "user", "content": "{instruction}"}}]'
+    #stopping_sequences: ["### Response:", "### End"]
+scaling_config:
+  num_workers: 1
+  num_gpus_per_worker: 0
+  num_cpus_per_worker: 3 # for inference
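Note: the doubled braces in prompt_format suggest the template is filled with Python's str.format, producing exactly the JSON message list that the updated pipelines json.loads() and pass to apply_chat_template. A quick sketch under that assumption:

import json

prompt_format = '[{{"role": "system", "content": "You are a friendly chatbot who always responds in the style of a pirate"}},{{"role": "user", "content": "{instruction}"}}]'

# str.format collapses {{ }} to literal braces and substitutes the user instruction.
rendered = prompt_format.format(instruction="What is Ray Serve?")
messages = json.loads(rendered)
print(messages[1])  # {'role': 'user', 'content': 'What is Ray Serve?'}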

setup.py
Lines changed: 1 addition & 1 deletion

@@ -52,7 +52,7 @@
         "deepspeed==0.14.0",
         "torchmetrics==1.2.1",
         "llama_cpp_python==0.2.20",
-        "transformers==4.33.3",
+        "transformers==4.39.1",
     ],
     "vllm": [
         "vllm==0.2.7",
