1 file changed (+15, −1): llmserve/backend/llm/engines/vllm
```diff
@@ -16,6 +16,7 @@
 import uuid
 import ray
 from llmserve.backend.server.utils import render_gradio_params
+import json
 
 
 from vllm.outputs import RequestOutput
@@ -163,9 +164,22 @@ async def predict(
 
         st = time.monotonic()
         request_id = str(uuid.uuid4())
+        tokenizer = self.engine.engine.tokenizer
+        prompt_text = inputs[0]
+
+        try:
+            prompt_text_bak = prompt_text
+            prompt_text = json.loads(prompt_text, strict=False)
+            prompt_text = tokenizer.apply_chat_template(prompt_text, tokenize=False, add_generation_prompt=True)
+        except Exception as ex:
+            logger.warning(f"Exception in apply_chat_template: {ex}")
+            logger.info("Input is not chat messages, or the model has no chat template; using the raw prompt")
+            prompt_text = prompt_text_bak
+
+        logger.info(f"Final prompt: {prompt_text}")
         # Construct a results generator from VLLM
         results_generator: AsyncIterator[RequestOutput] = self.engine.generate(
-            inputs[0],
+            prompt_text,
            self._parse_sampling_params(sampling_params),
            request_id,
        )
```
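In short, the change lets `predict` accept either a plain prompt string or a JSON-encoded chat message list in `inputs[0]`: if the input parses as JSON, it is rendered through the tokenizer's chat template; otherwise the original string is passed to vLLM unchanged. Below is a minimal, self-contained sketch of the same fallback logic using a Hugging Face tokenizer. The `render_prompt` helper and the model name are illustrative only, not part of the patch.

```python
import json
import logging

from transformers import AutoTokenizer

logger = logging.getLogger(__name__)

# Any model whose tokenizer ships a chat template works here; Qwen2 is only an example.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")


def render_prompt(prompt_text: str) -> str:
    """Treat the input as JSON chat messages if possible, otherwise keep the raw string."""
    try:
        messages = json.loads(prompt_text, strict=False)
        return tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
    except Exception as ex:
        logger.warning("apply_chat_template failed, falling back to the raw prompt: %s", ex)
        return prompt_text


# A JSON chat payload is wrapped in the model's chat markup...
print(render_prompt('[{"role": "user", "content": "Hello!"}]'))
# ...while a plain prompt string passes through unchanged.
print(render_prompt("Hello!"))
```

The broad try/except is what keeps backward compatibility: callers that still send plain prompt strings hit the exception path and get the previous behavior.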