@@ -703,13 +703,10 @@ def create_tokens_prompt_from_beam(
         return outputs

-    def chat(
+    def preprocess_chat(
         self,
         messages: Union[list[ChatCompletionMessageParam],
                         list[list[ChatCompletionMessageParam]]],
-        sampling_params: Optional[Union[SamplingParams,
-                                        list[SamplingParams]]] = None,
-        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
         lora_request: Optional[LoRARequest] = None,
         chat_template: Optional[str] = None,
         chat_template_content_format: ChatTemplateContentFormatOption = "auto",
@@ -718,56 +715,16 @@ def chat(
         tools: Optional[list[dict[str, Any]]] = None,
         chat_template_kwargs: Optional[dict[str, Any]] = None,
         mm_processor_kwargs: Optional[dict[str, Any]] = None,
-    ) -> list[RequestOutput]:
+    ) -> list[TokensPrompt]:
         """
-        Generate responses for a chat conversation.
-
-        The chat conversation is converted into a text prompt using the
-        tokenizer and calls the [generate][vllm.LLM.generate] method to generate
-        the responses.
-
-        Multi-modal inputs can be passed in the same way you would pass them
-        to the OpenAI API.
-
-        Args:
-            messages: A list of conversations or a single conversation.
-
-                - Each conversation is represented as a list of messages.
-                - Each message is a dictionary with 'role' and 'content' keys.
-
-            sampling_params: The sampling parameters for text generation.
-                If None, we use the default sampling parameters. When it
-                is a single value, it is applied to every prompt. When it
-                is a list, the list must have the same length as the
-                prompts and it is paired one by one with the prompt.
-            use_tqdm: If `True`, shows a tqdm progress bar.
-                If a callable (e.g., `functools.partial(tqdm, leave=False)`),
-                it is used to create the progress bar.
-                If `False`, no progress bar is created.
-            lora_request: LoRA request to use for generation, if any.
-            chat_template: The template to use for structuring the chat.
-                If not provided, the model's default chat template will be used.
-            chat_template_content_format: The format to render message content.
-
-                - "string" will render the content as a string.
-                  Example: `"Who are you?"`
-                - "openai" will render the content as a list of dictionaries,
-                  similar to OpenAI schema.
-                  Example: `[{"type": "text", "text": "Who are you?"}]`
-
-            add_generation_prompt: If True, adds a generation template
-                to each message.
-            continue_final_message: If True, continues the final message in
-                the conversation instead of starting a new one. Cannot be
-                `True` if `add_generation_prompt` is also `True`.
-            chat_template_kwargs: Additional kwargs to pass to the chat
-                template.
-            mm_processor_kwargs: Multimodal processor kwarg overrides for this
-                chat request. Only used for offline requests.
+        Generate prompts for a chat conversation. The pre-processed
+        prompts can then be used as input for the other LLM methods.

+        Refer to `chat` for a complete description of the arguments.

         Returns:
-            A list of `RequestOutput` objects containing the generated
-            responses in the same order as the input messages.
+            A list of `TokensPrompt` objects containing the tokenized
+            prompt after chat template interpolation, and the
+            pre-processed multi-modal inputs.
         """
         list_of_messages: list[list[ChatCompletionMessageParam]]
@@ -800,7 +757,7 @@ def chat(
         )
         _chat_template_kwargs.update(chat_template_kwargs or {})

-        prompts: list[Union[TokensPrompt, TextPrompt]] = []
+        prompts: list[TokensPrompt] = []

         for msgs in list_of_messages:
             # NOTE: _parse_chat_message_content_parts() currently doesn't
@@ -844,6 +801,87 @@ def chat(
             prompts.append(prompt)

+        return prompts
+
+    def chat(
+        self,
+        messages: Union[list[ChatCompletionMessageParam],
+                        list[list[ChatCompletionMessageParam]]],
+        sampling_params: Optional[Union[SamplingParams,
+                                        list[SamplingParams]]] = None,
+        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
+        lora_request: Optional[LoRARequest] = None,
+        chat_template: Optional[str] = None,
+        chat_template_content_format: ChatTemplateContentFormatOption = "auto",
+        add_generation_prompt: bool = True,
+        continue_final_message: bool = False,
+        tools: Optional[list[dict[str, Any]]] = None,
+        chat_template_kwargs: Optional[dict[str, Any]] = None,
+        mm_processor_kwargs: Optional[dict[str, Any]] = None,
+    ) -> list[RequestOutput]:
+        """
+        Generate responses for a chat conversation.
+
+        The chat conversation is converted into a text prompt using the
+        tokenizer and calls the [generate][vllm.LLM.generate] method to generate
+        the responses.
+
+        Multi-modal inputs can be passed in the same way you would pass them
+        to the OpenAI API.
+
+        Args:
+            messages: A list of conversations or a single conversation.
+
+                - Each conversation is represented as a list of messages.
+                - Each message is a dictionary with 'role' and 'content' keys.
+
+            sampling_params: The sampling parameters for text generation.
+                If None, we use the default sampling parameters. When it
+                is a single value, it is applied to every prompt. When it
+                is a list, the list must have the same length as the
+                prompts and it is paired one by one with the prompt.
+            use_tqdm: If `True`, shows a tqdm progress bar.
+                If a callable (e.g., `functools.partial(tqdm, leave=False)`),
+                it is used to create the progress bar.
+                If `False`, no progress bar is created.
+            lora_request: LoRA request to use for generation, if any.
+            chat_template: The template to use for structuring the chat.
+                If not provided, the model's default chat template will be used.
+            chat_template_content_format: The format to render message content.
+
+                - "string" will render the content as a string.
+                  Example: `"Who are you?"`
+                - "openai" will render the content as a list of dictionaries,
+                  similar to OpenAI schema.
+                  Example: `[{"type": "text", "text": "Who are you?"}]`
+
+            add_generation_prompt: If True, adds a generation template
+                to each message.
+            continue_final_message: If True, continues the final message in
+                the conversation instead of starting a new one. Cannot be
+                `True` if `add_generation_prompt` is also `True`.
+            chat_template_kwargs: Additional kwargs to pass to the chat
+                template.
+            mm_processor_kwargs: Multimodal processor kwarg overrides for this
+                chat request. Only used for offline requests.
+
+        Returns:
+            A list of `RequestOutput` objects containing the generated
+            responses in the same order as the input messages.
+        """
+
+        prompts = self.preprocess_chat(
+            messages=messages,
+            lora_request=lora_request,
+            chat_template=chat_template,
+            chat_template_content_format=chat_template_content_format,
+            add_generation_prompt=add_generation_prompt,
+            continue_final_message=continue_final_message,
+            tools=tools,
+            chat_template_kwargs=chat_template_kwargs,
+            mm_processor_kwargs=mm_processor_kwargs,
+        )
+
         return self.generate(
             prompts,
             sampling_params=sampling_params,
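
For context, a minimal usage sketch of the API split this diff introduces: `preprocess_chat` applies the chat template and tokenizes the conversation once, and the resulting `TokensPrompt` list can be fed to `generate` directly, which is what `chat` now does internally. The model name and sampling values below are placeholders, not taken from this diff.

```python
from vllm import LLM, SamplingParams

# Placeholder model; any chat-capable model with a chat template works.
llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct")

conversation = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Who are you?"},
]

# New in this diff: pre-process the chat once into TokensPrompt objects
# (chat template interpolated, text tokenized, multi-modal inputs processed).
prompts = llm.preprocess_chat(messages=conversation)

# The pre-processed prompts feed straight into generate(), which is
# exactly what chat() now does under the hood.
outputs = llm.generate(prompts, sampling_params=SamplingParams(temperature=0.7))
print(outputs[0].outputs[0].text)
```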