1111 apply_hf_chat_template ,
1212 parse_chat_messages_futures ,
1313)
14+ from vllm .config .structured_outputs import StructuredOutputsConfig
1415from gpt_server .settings import get_model_config
1516
1617
@@ -46,11 +47,13 @@ def __init__(self, model_path, tokenizer: PreTrainedTokenizer) -> None:
4647 enable_prefix_caching = model_config .enable_prefix_caching ,
4748 dtype = model_config .dtype ,
4849 max_model_len = model_config .max_model_len ,
49- guided_decoding_backend = "xgrammar" ,
50+ # guided_decoding_backend="xgrammar",
5051 # 支持LMCache的KV传输
5152 kv_transfer_config = KVTransferConfig (
5253 kv_connector = "LMCacheConnectorV1" , kv_role = "kv_both"
5354 ),
55+ prefix_caching_hash_algo = "xxhash" ,
56+ structured_outputs_config = StructuredOutputsConfig (backend = "xgrammar" ),
5457 )
5558 self .engine = AsyncLLMEngine .from_engine_args (self .engine_args )
5659 self .tokenizer = tokenizer
@@ -86,11 +89,11 @@ async def stream_chat(self, params: Dict[str, Any]) -> AsyncGenerator:
8689
8790 multimodal = params .get ("multimodal" , False )
8891 tokenizer = await self .engine .get_tokenizer ()
92+ model_config = self .engine .model_config
8993 if multimodal : # 多模态模型
9094 # ----------------------------------------------------------------
91- model_config = await self .engine .get_model_config ()
9295 conversation , mm_data_future , _ = parse_chat_messages_futures (
93- messages , model_config , tokenizer , content_format = "string"
96+ messages , model_config , content_format = "string"
9497 )
9598
9699 prompt = apply_hf_chat_template (
@@ -101,7 +104,7 @@ async def stream_chat(self, params: Dict[str, Any]) -> AsyncGenerator:
101104 ),
102105 add_generation_prompt = True ,
103106 tools = tools ,
104- model_config = await self . engine . get_model_config () ,
107+ model_config = model_config ,
105108 enable_thinking = enable_thinking ,
106109 )
107110 mm_data = await mm_data_future
@@ -116,7 +119,7 @@ async def stream_chat(self, params: Dict[str, Any]) -> AsyncGenerator:
116119 ),
117120 add_generation_prompt = True ,
118121 tools = tools ,
119- model_config = await self . engine . get_model_config () ,
122+ model_config = model_config ,
120123 enable_thinking = enable_thinking ,
121124 )
122125 input_ids = params .get ("input_ids" , None )
0 commit comments