Skip to content

Commit 5f71aa9

Browse files
authored
[Serving] Use stop strs and token ids for completions (#2534)
This PR applies the stop strings and stop token ids defined in the conversation template to raw text completions, so that generation can stop whenever the model outputs a stop token id or stop string. Prior to this commit, raw text generation never stopped when max tokens was not given. This commit helps reduce the frequency of such events. Nevertheless, if the model does not output a stop string/token id, the generation will still not stop.
1 parent 50a1a7c commit 5f71aa9

File tree

2 files changed

+12
-2
lines changed

2 files changed

+12
-2
lines changed

python/mlc_llm/serve/engine.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1298,6 +1298,7 @@ async def _handle_completion(
12981298
self.state,
12991299
self.tokenizer,
13001300
self.max_input_sequence_length,
1301+
self.conv_template.model_copy(deep=True),
13011302
)
13021303
_ = prompt_length
13031304
if echo_response is not None:
@@ -1840,6 +1841,7 @@ def _handle_completion(
18401841
self.state,
18411842
self.tokenizer,
18421843
self.max_input_sequence_length,
1844+
self.conv_template.model_copy(deep=True),
18431845
)
18441846
_ = prompt_length
18451847
if echo_response is not None:

python/mlc_llm/serve/engine_base.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -862,12 +862,13 @@ def process_chat_completion_stream_output( # pylint: disable=too-many-arguments
862862
return response
863863

864864

865-
def process_completion_request(
865+
def process_completion_request( # pylint: disable=too-many-arguments
866866
request: openai_api_protocol.CompletionRequest,
867867
request_id: str,
868868
engine_state: EngineState,
869869
tokenizer: Tokenizer,
870870
max_input_sequence_length: int,
871+
conv_template: Conversation,
871872
) -> Tuple[List[int], GenerationConfig, int, Optional[openai_api_protocol.CompletionResponse]]:
872873
"""Process the given CompletionRequest, apply request validity
873874
checks, and return the processed prompts, and other info.
@@ -889,6 +890,9 @@ def process_completion_request(
889890
max_input_sequence_length : int
890891
The maximum allowed total prompt length.
891892
893+
conv_template : Conversation
894+
The conversation template of the model.
895+
892896
Returns
893897
-------
894898
prompt : List[int]
@@ -917,7 +921,11 @@ def process_completion_request(
917921
assert isinstance(prompt, list)
918922

919923
# Process generation config. Create request id.
920-
generation_cfg = engine_utils.get_generation_config(request)
924+
generation_cfg = engine_utils.get_generation_config(
925+
request,
926+
extra_stop_token_ids=conv_template.stop_token_ids,
927+
extra_stop_str=conv_template.stop_str,
928+
)
921929

922930
# - Echo back the prompt.
923931
echo_response = None

0 commit comments

Comments
 (0)