Skip to content

Commit 7d4d6cc

Browse files
authored
[TRTLLM-7292][feat] Support multi-threaded tokenizers for trtllm-serve (cherry-pick) (#7776)
Signed-off-by: Yilin Fan <206948969+nv-yilinf@users.noreply.github.com>
1 parent 1f2761e commit 7d4d6cc

File tree

1 file changed

+10
-1
lines changed

1 file changed

+10
-1
lines changed

tensorrt_llm/serve/openai_server.py

Lines changed: 10 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -23,6 +23,7 @@
23 23
from tensorrt_llm.executor import CppExecutorError
24 24
from tensorrt_llm.executor.postproc_worker import PostprocParams
25 25
from tensorrt_llm.inputs import prompt_inputs
26+
from tensorrt_llm.inputs.data import TokensPrompt
26 27
from tensorrt_llm.inputs.utils import ConversationMessage, apply_chat_template
27 28
from tensorrt_llm.llmapi import DisaggregatedParams as LlmDisaggregatedParams
28 29
from tensorrt_llm.llmapi import MultimodalEncoder
@@ -677,8 +678,16 @@ async def generator_wrapper(generator: AsyncIterator[Any]):
677 678
if request.stream else completion_response_post_processor,
678 679
postproc_args=postproc_args,
679 680
)
681+
682+
prompt = prompt_inputs(prompt)
683+
if prompt.get("prompt") is not None:
684+
prompt_token_ids, extra_processed_inputs = await asyncio.to_thread(self.llm.input_processor, prompt, sampling_params)
685+
tokens_prompt = TokensPrompt(prompt_token_ids=prompt_token_ids, query_token_ids=extra_processed_inputs.get("query_token_ids") if extra_processed_inputs is not None else None)
686+
else:
687+
tokens_prompt = prompt
688+
680 689
promise = self.llm.generate_async(
681-
inputs=prompt,
690+
inputs=tokens_prompt,
682 691
sampling_params=sampling_params,
683 692
_postproc_params=postproc_params,
684 693
streaming=request.stream,

0 commit comments

Comments
 (0)