 try:
     readline.read_history_file(HISTORY_FILE)
 except FileNotFoundError:
-    pass
+    pass
 
 
-thinking_flag = False
-stop_thinking = threading.Event()
-
-def thinking_animation():
-    dots = 0
-    while not stop_thinking.is_set():
-        dots = (dots % 6) + 1
-        sys.stdout.write(f"\rAI: Thinking{'.' * dots}")
-        sys.stdout.flush()
-        time.sleep(1)
-    sys.stdout.write("\r" + " " * 20 + "\r")
-    sys.stdout.flush()
-
 def stream_vllm_response(messages, model="deepseek_test"):
-    global thinking_flag
     headers = {"Content-Type": "application/json"}
-    payload = {
-        "model": model,
-        "messages": messages,
-        "stream": True
-    }
+    payload = {"model": model, "messages": messages, "stream": True}
 
     token_count = 0
     start_time = time.time()
-
-    with requests.post(VLLM_API_URL, headers=headers, json=payload, stream=True) as response:
-        found_think_tag = False
-        thinking_flag = True
-        stop_thinking.clear()
 
-
-        thinking_thread = threading.Thread(target=thinking_animation)
-        thinking_thread.start()
+    with requests.post(
+        VLLM_API_URL, headers=headers, json=payload, stream=True
+    ) as response:
 
-        output_buffer = ""
+        output_buffer = ""
 
         for line in response.iter_lines():
             if line:
                 try:
-                    data = json.loads(line.decode("utf-8")[6:])
+                    # each SSE line is `data: <json>`; [6:] strips the "data: " prefix
+                    data = json.loads(line.decode("utf-8")[6:])
                     if "choices" in data and data["choices"]:
                         token = data["choices"][0]["delta"].get("content", "")
                         if token:
-                            if not found_think_tag and "</think>" in token:
-                                found_think_tag = True
-                                token = token.split("</think>", 1)[-1]  # delete `<think>...</think>` for deepseek
-
-                            if found_think_tag:
-                                if thinking_flag:
-                                    stop_thinking.set()
-                                    thinking_thread.join()
-                                    thinking_flag = False
-                                print(token, end="", flush=True)
+                            print(token, end="", flush=True)
+                            output_buffer += token  # accumulate the reply for the caller
                             token_count += 1
                 except json.JSONDecodeError:
                     continue
-
+
         elapsed_time = time.time() - start_time
         tokens_per_sec = token_count / elapsed_time if elapsed_time > 0 else 0
-        print(f"\n\n[Model Metrics] Tokens: {token_count}, Time: {elapsed_time:.2f}s, Tokens/s: {tokens_per_sec:.2f}")
+        print(
+            f"\n\n[Model Metrics] Tokens: {token_count}, Time: {elapsed_time:.2f}s, Tokens/s: {tokens_per_sec:.2f}"
+        )
+    return output_buffer  # hand the full reply back so the chat history can record it
+
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Interactive chat client for vLLM with streaming.")
-    parser.add_argument("--model-id", dest="model_id", type=str, required=True, help="Specify the serve model name ")
+    parser = argparse.ArgumentParser(
+        description="Interactive chat client for vLLM with streaming."
+    )
+    parser.add_argument(
+        "--model-id",
+        dest="model_id",
+        type=str,
+        required=True,
+        help="Specify the served model name",
+    )
     args = parser.parse_args()
-
+
     messages = [{"role": "system", "content": "You are a helpful AI assistant."}]
 
     while True:
         try:
-            user_input = input("\nUser: ")
-            readline.write_history_file(HISTORY_FILE)
+            user_input = input("\nUser: ")
+            readline.write_history_file(HISTORY_FILE)
         except (KeyboardInterrupt, EOFError):
             print("\nChat ended.")
             break
@@ -95,7 +75,6 @@ def stream_vllm_response(messages, model="deepseek_test"):
             print("Chat ended.")
             break
 
-        messages.append({"role": "user", "content": user_input})  # record user input
-        stream_vllm_response(messages, args.model_id)  # interactive streaming output
-        messages.append({"role": "assistant", "content": ""})  # record the AI reply
-
+        messages.append({"role": "user", "content": user_input})  # record user input
+        reply = stream_vllm_response(messages, args.model_id)
+        messages.append({"role": "assistant", "content": reply})  # record the AI reply
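
Note on the stream format: the `[6:]` slice and the bare `except json.JSONDecodeError: continue` both assume an OpenAI-compatible SSE stream, which vLLM's chat-completions endpoint emits and which VLLM_API_URL presumably points at. Each chunk arrives as a `data: <json>` line, and the stream ends with a `data: [DONE]` sentinel that is not valid JSON. A minimal self-contained sketch of that parsing, using made-up sample chunks in place of response.iter_lines():

import json

# Hypothetical sample chunks, for illustration only; the real client reads
# these from response.iter_lines().
sample_lines = [
    b'data: {"choices": [{"delta": {"content": "Hel"}}]}',
    b'data: {"choices": [{"delta": {"content": "lo"}}]}',
    b'data: [DONE]',  # end-of-stream sentinel: deliberately not JSON
]

for line in sample_lines:
    try:
        # [6:] == len("data: "), i.e. it strips the SSE prefix
        data = json.loads(line.decode("utf-8")[6:])
    except json.JSONDecodeError:
        continue  # the [DONE] sentinel (and any non-JSON noise) lands here
    print(data["choices"][0]["delta"].get("content", ""), end="", flush=True)
print()

The [DONE] sentinel fails json.loads and is skipped, which is how the client above terminates the stream cleanly without special-casing it.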