
Commit 428c527

update demo script: fix stream_script
1 parent 7b6fc49 commit 428c527

2 files changed: +35 −52 lines changed

vllm/demo/run_vllm_serving.sh

Lines changed: 7 additions & 2 deletions
@@ -6,7 +6,12 @@ CONFIG_FILE="model_config.json"
 
 if [ -z "$MODEL_NAME" ]; then
     echo "× Please provide the model name, for example:"
-    echo "  ./run_vllm_serving.sh DeepSeek-R1-Distill-Qwen-1.5B"
+    echo "  ./run_vllm_serving.sh DeepSeek-R1-Distill-Qwen-1.5B"
+    exit 1
+fi
+
+if [ ! -f "$CONFIG_FILE" ]; then
+    echo "× Error: Config file $CONFIG_FILE not found! Please make sure it exists."
     exit 1
 fi

@@ -21,7 +26,7 @@ print(info.get('url', ''), info.get('tensor_parallel_size', ''))
 ")
 
 if [ -z "$MODEL_URL" ]; then
-    echo "× $MODEL_NAME is not supported yet, please refer to the website to try other models: https://docs.mthreads.com/mtt/mtt-doc-online/compability"
+    echo "× $MODEL_NAME is not supported, please refer to the website to try other models: https://docs.mthreads.com/mtt/mtt-doc-online/compability"
     exit 1
 fi
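For context: $MODEL_URL above is produced by the script's embedded Python snippet (its last line is visible in the hunk header), which looks the model up in model_config.json and prints its url and tensor_parallel_size. A minimal standalone sketch of that lookup, assuming the config maps model names to objects with those two keys (the actual schema is not part of this diff):

# Sketch of the embedded config lookup. The model_config.json layout
# shown in the comment below is an assumption, not taken from this commit.
import json
import sys

CONFIG_FILE = "model_config.json"
MODEL_NAME = sys.argv[1] if len(sys.argv) > 1 else "DeepSeek-R1-Distill-Qwen-1.5B"

with open(CONFIG_FILE) as f:
    config = json.load(f)

# Assumed layout:
# {"DeepSeek-R1-Distill-Qwen-1.5B": {"url": "http://127.0.0.1:8000", "tensor_parallel_size": 1}, ...}
info = config.get(MODEL_NAME, {})
print(info.get("url", ""), info.get("tensor_parallel_size", ""))

If the model has no entry, both fields print as empty strings, which is exactly what the -z "$MODEL_URL" guard above catches.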

vllm/stream_chat.py

Lines changed: 28 additions & 50 deletions
@@ -12,81 +12,60 @@
     try:
         readline.read_history_file(HISTORY_FILE)
     except FileNotFoundError:
-        pass
+        pass
 
 
-thinking_flag = False
-stop_thinking = threading.Event()
-
-def thinking_animation():
-    dots = 0
-    while not stop_thinking.is_set():
-        dots = (dots % 6) + 1
-        sys.stdout.write(f"\rAI: Thinking{'.' * dots} ")
-        sys.stdout.flush()
-        time.sleep(1)
-    sys.stdout.write("\r" + " " * 20 + "\r")
-    sys.stdout.flush()
-
 def stream_vllm_response(messages, model="deepseek_test"):
     global thinking_flag
     headers = {"Content-Type": "application/json"}
-    payload = {
-        "model": model,
-        "messages": messages,
-        "stream": True
-    }
+    payload = {"model": model, "messages": messages, "stream": True}
 
     token_count = 0
     start_time = time.time()
-
-    with requests.post(VLLM_API_URL, headers=headers, json=payload, stream=True) as response:
-        found_think_tag = False
-        thinking_flag = True
-        stop_thinking.clear()
 
-
-        thinking_thread = threading.Thread(target=thinking_animation)
-        thinking_thread.start()
+    with requests.post(
+        VLLM_API_URL, headers=headers, json=payload, stream=True
+    ) as response:
 
-        output_buffer = ""
+        output_buffer = ""
 
         for line in response.iter_lines():
             if line:
                 try:
-                    data = json.loads(line.decode("utf-8")[6:])
+                    data = json.loads(line.decode("utf-8")[6:])
                     if "choices" in data and data["choices"]:
                         token = data["choices"][0]["delta"].get("content", "")
                         if token:
-                            if not found_think_tag and "</think>" in token:
-                                found_think_tag = True
-                                token = token.split("</think>", 1)[-1] # delete `<think>... </think>` for deepseek
-
-                            if found_think_tag:
-                                if thinking_flag:
-                                    stop_thinking.set()
-                                    thinking_thread.join()
-                                    thinking_flag = False
-                            print(token, end="", flush=True)
                             token_count += 1
                 except json.JSONDecodeError:
                     continue
-
+
     elapsed_time = time.time() - start_time
     tokens_per_sec = token_count / elapsed_time if elapsed_time > 0 else 0
-    print(f"\n\n[Model Metrics] Tokens: {token_count}, Time: {elapsed_time:.2f}s, Tokens/s: {tokens_per_sec:.2f}")
+    print(
+        f"\n\n[Model Metrics] Tokens: {token_count}, Time: {elapsed_time:.2f}s, Tokens/s: {tokens_per_sec:.2f}"
+    )
+
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Interactive chat client for vLLM with streaming.")
-    parser.add_argument("--model-id",dest="model_id", type=str, required=True, help="Specify the serve model name ")
+    parser = argparse.ArgumentParser(
+        description="Interactive chat client for vLLM with streaming."
+    )
+    parser.add_argument(
+        "--model-id",
+        dest="model_id",
+        type=str,
+        required=True,
+        help="Specify the serve model name ",
+    )
     args = parser.parse_args()
-
+
     messages = [{"role": "system", "content": "You are a helpful AI assistant."}]
 
     while True:
         try:
-            user_input = input("\nUser: ")
-            readline.write_history_file(HISTORY_FILE)
+            user_input = input("\nUser: ")
+            readline.write_history_file(HISTORY_FILE)
         except (KeyboardInterrupt, EOFError):
             print("\n聊天结束。")
             break
@@ -95,7 +74,6 @@ def stream_vllm_response(messages, model="deepseek_test"):
     print("聊天结束。")
     break
 
-    messages.append({"role": "user", "content": user_input}) # 记录用户输入
-    stream_vllm_response(messages,args.model_id) # 交互式流式输出
-    messages.append({"role": "assistant", "content": ""}) # 记录 AI 回复
-
+    messages.append({"role": "user", "content": user_input}) # record user input
+    stream_vllm_response(messages, args.model_id)
+    messages.append({"role": "assistant", "content": ""})
