Commit c526eb3 (2 parents: b21544b + 22d423f)

Merge pull request #41 from MooreThreads/add_webUI_demo

fix some problems for vllm demo

File tree

4 files changed (+17 −6 lines)


vllm/demo/gradio_demo/app.py

Lines changed: 9 additions & 5 deletions
@@ -57,6 +57,7 @@ def chat_with_model_streaming(user_input, history):
     # ✅ Record the start time
     start_time = time.time()
     token_count = 0  # ✅ Count the generated tokens
+    first_token_time = None
 
     try:
         # ✅ Stream the request with requests
@@ -77,14 +78,17 @@ def chat_with_model_streaming(user_input, history):
                     bot_response += delta["content"]
                     # ✅ Incrementally update the chat history
                     token_count += 1  # ✅ Count each token
+                    if first_token_time is None and token_count > 0:
+                        first_token_time = time.time()
+
                     yield history + [(user_input, bot_response)], "", "推理中..."
             except json.JSONDecodeError:
                 pass
         # ✅ Record the end time & compute the elapsed time
-        elapsed_time = time.time() - start_time
+        first_token_latency = first_token_time - start_time if first_token_time is not None else 0
+        elapsed_time = time.time() - first_token_time
         tps = token_count / elapsed_time if elapsed_time > 0 else 0  # ✅ Compute tokens per second
-
-        speed_text = f"⏱️ 耗时: {elapsed_time:.2f} 秒 | 🔢 Tokens: {token_count} | ⚡ 速度: {tps:.2f} TPS"
+        speed_text = f"⏳ 首字延迟: {first_token_latency:.2f} 秒 | ⏱️ 耗时: {elapsed_time:.2f} 秒 | 🔢 Tokens: {token_count} | ⚡ 速度: {tps:.2f} TPS"  # ⏳
         yield history + [(user_input, bot_response)], "", speed_text  # ✅ Return the inference speed
 
     except Exception as e:
@@ -95,15 +99,15 @@ def chat_with_model_streaming(user_input, history):
 
 # ✅ Clear the chat history & timer
 def clear_chat():
-    return [], "", "⏱️ 耗时: 0.00 秒 | 🔢 Tokens: 0 | ⚡ 速度: 0.00 TPS"  # ✅ Reset the whole UI
+    return [], "", "⏳ 首字延迟: 0.00 秒 | ⏱️ 耗时: 0.00 秒 | 🔢 Tokens: 0 | ⚡ 速度: 0.00 TPS"  # ✅ Reset the whole UI
 
 # Build the Gradio interface
 with gradio_musa.Blocks() as demo:
     # gr.Markdown("## 💬 Web UI 接入 vLLM 模型(流式输出)")
     chatbot = gr.Chatbot(label="Running on MTT S4000")
     msg_input = gr.Textbox(placeholder="请输入你的问题", label="输入...", lines=1, autofocus=True)
 
-    speed_display = gr.Textbox(label="推理速度", value="⏱️ 耗时: 0.00 秒 | 🔢 Tokens: 0 | ⚡ 速度: 0.00 TPS", interactive=False)  # ✅ Show the inference speed
+    speed_display = gr.Textbox(label="推理速度", value="⏳ 首字延迟: 0.00 秒 | ⏱️ 耗时: 0.00 秒 | 🔢 Tokens: 0 | ⚡ 速度: 0.00 TPS", interactive=False)  # ✅ Show the inference speed
 
     # clear = gr.Button("清除")
     # submit = gr.Button("提交")
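The revised timing logic separates time-to-first-token from decode throughput: TPS is now computed over the window starting at the first streamed token rather than at request time. A minimal standalone sketch of the same measurement, with a guard for streams that never yield a token (the measure_stream helper and the fake token iterator below are illustrative, not part of the demo):

import time

def measure_stream(token_iter):
    """Consume a token stream; report first-token latency, decode TPS, and token count."""
    start_time = time.time()
    first_token_time = None
    token_count = 0

    for _ in token_iter:
        token_count += 1
        if first_token_time is None:
            first_token_time = time.time()  # reference point for time-to-first-token

    if first_token_time is None:
        return 0.0, 0.0, 0  # guard: the stream produced no tokens at all

    first_token_latency = first_token_time - start_time
    decode_time = time.time() - first_token_time
    tps = token_count / decode_time if decode_time > 0 else 0.0
    return first_token_latency, tps, token_count

# Example with a fake three-token stream:
latency, tps, count = measure_stream(iter(["Hello", ",", " world"]))
print(f"first-token latency: {latency:.2f} s | tokens: {count} | speed: {tps:.2f} TPS")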

vllm/demo/gradio_demo/gradio_musa.py

Lines changed: 0 additions & 1 deletion
@@ -7,7 +7,6 @@
     <div class="top">
         <div class="top-container">
             <img class="logo" width="140" height="37" src="https://kuae-playground.mthreads.com/image/[email protected]">
-            <h2>GPU GENIUS</h2>
         </div>
     </div>"""
1312

vllm/demo/run_vllm_serving.sh

Lines changed: 1 addition & 0 deletions
@@ -241,6 +241,7 @@ wait_for_log_update() {
 echo -e "\e[32mInstalling gradio...\e[0m" >&2
 pip install gradio
 echo -e "\e[32mStart gradio webui...\e[0m" >&2
+echo -e "\e[32mContainer: $CONTAINER_NAME\e[0m" >&2
 setsid python -u ./gradio_demo/app.py --ip "$host" --port "$port" --model-name "$model_name" | tee -a webui.log &
 wait $!  # wait for this process to exit
 exit 0
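The launcher passes --ip, --port, and --model-name to app.py. A minimal sketch of how the demo script might consume those flags, assuming argparse; the actual option handling inside app.py is not shown in this diff:

import argparse

# Hypothetical flag parsing matching the launcher's invocation above.
parser = argparse.ArgumentParser(description="Gradio web UI for a vLLM server")
parser.add_argument("--ip", default="127.0.0.1", help="address the vLLM server listens on")
parser.add_argument("--port", type=int, default=8000, help="vLLM server port")
parser.add_argument("--model-name", required=True, help="model identifier sent with each request")
args = parser.parse_args()

api_url = f"http://{args.ip}:{args.port}/v1/chat/completions"  # assumed OpenAI-compatible endpoint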

vllm/demo/supported_models.json

Lines changed: 7 additions & 0 deletions
@@ -25,5 +25,12 @@
     "modelscope_url": "https://www.modelscope.cn/Qwen/Qwen2.5-0.5B-Instruct.git",
     "huggingface_url": "https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct",
     "tensor_parallel_size": [1]
+  },
+
+  "qwq-32b": {
+    "name": "Qwen/QwQ-32B",
+    "modelscope_url": "https://www.modelscope.cn/Qwen/QwQ-32B.git",
+    "huggingface_url": "https://huggingface.co/Qwen/QwQ-32B",
+    "tensor_parallel_size": [2, 4]
   }
 }
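Each registry entry pairs a model key with its download URLs and the tensor-parallel degrees it supports; QwQ-32B is registered for TP 2 or 4 only. A small sketch of how a launcher might validate a requested TP size against this file (the allowed_tp_sizes helper is hypothetical; the path is the file touched in this diff):

import json

def allowed_tp_sizes(model_key, path="vllm/demo/supported_models.json"):
    """Look up the tensor-parallel sizes registered for a model key."""
    with open(path) as f:
        registry = json.load(f)
    return registry[model_key]["tensor_parallel_size"]  # KeyError if unregistered

# QwQ-32B is listed with TP 2 or 4, so a single-GPU launch should be rejected:
requested_tp = 1
if requested_tp not in allowed_tp_sizes("qwq-32b"):
    raise SystemExit(f"tensor_parallel_size={requested_tp} not supported for qwq-32b")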
