Skip to content

Commit cdbdb7c

Browse files
committed
complete webui for vllm
1 parent e487d63 commit cdbdb7c

File tree

5 files changed

+39
-193
lines changed

5 files changed

+39
-193
lines changed
-3.9 KB
Binary file not shown.

vllm/demo/gradio_demo/app.py

Lines changed: 36 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
import requests
33
import json
44
import argparse
5+
import time
6+
import gradio_musa
57

68

79
def parse_args():
@@ -52,6 +54,10 @@ def chat_with_model_streaming(user_input, history):
5254
history = history or [] # 初始化历史记录
5355
bot_response = "" # 存储逐步生成的回答
5456

57+
# ✅ 记录开始时间
58+
start_time = time.time()
59+
token_count = 0 # ✅ 记录生成的 Token 数量
60+
5561
try:
5662
# ✅ 使用 requests 的流式请求
5763
with requests.post(VLLM_API_URL, json=payload, stream=True) as response:
@@ -70,25 +76,45 @@ def chat_with_model_streaming(user_input, history):
7076
if "content" in delta:
7177
bot_response += delta["content"]
7278
# ✅ 逐步更新聊天记录
73-
yield history + [(user_input, bot_response)], ""
79+
token_count += 1 # ✅ 每个 Token 计数
80+
yield history + [(user_input, bot_response)], "", "推理中..."
7481
except json.JSONDecodeError:
7582
pass
83+
# ✅ 记录结束时间 & 计算时长
84+
elapsed_time = time.time() - start_time
85+
tps = token_count / elapsed_time if elapsed_time > 0 else 0 # ✅ 计算 Tokens Per Second
86+
87+
speed_text = f"⏱️ 耗时: {elapsed_time:.2f} 秒 | 🔢 Tokens: {token_count} | ⚡ 速度: {tps:.2f} TPS"
88+
yield history + [(user_input, bot_response)], "", speed_text # ✅ 返回推理速度
7689

7790
except Exception as e:
7891
bot_response = f"❌ 推理失败: {str(e)}"
7992
yield history + [(user_input, bot_response)], ""
8093

94+
95+
96+
# ✅ 清除聊天记录 & 计时器
97+
def clear_chat():
98+
return [], "", "⏱️ 耗时: 0.00 秒 | 🔢 Tokens: 0 | ⚡ 速度: 0.00 TPS" # ✅ 清空所有 UI
99+
81100
# 构建 Gradio 界面
82-
with gr.Blocks() as demo:
83-
gr.Markdown("## 💬 Web UI 接入 vLLM 模型(流式输出)")
84-
chatbot = gr.Chatbot()
85-
txt = gr.Textbox(placeholder="请输入你的问题", label="输入")
86-
clear = gr.Button("清除")
87-
submit = gr.Button("提交")
101+
with gradio_musa.Blocks() as demo:
102+
# gr.Markdown("## 💬 Web UI 接入 vLLM 模型(流式输出)")
103+
chatbot = gr.Chatbot(label="Running on MTT S4000")
104+
msg_input = gr.Textbox(placeholder="请输入你的问题", label="输入...", lines=1, autofocus=True)
105+
106+
speed_display = gr.Textbox(label="推理速度", value="⏱️ 耗时: 0.00 秒 | 🔢 Tokens: 0 | ⚡ 速度: 0.00 TPS", interactive=False) # >✅ 显示推理速度
107+
108+
# clear = gr.Button("清除")
109+
# submit = gr.Button("提交")
110+
with gr.Row():
111+
submit_btn = gr.Button(value="提交")
112+
clear_btn = gr.Button("清除历史") # ✅ 添加清除按钮
88113

89114
# ✅ 使用流式函数
90-
submit.click(chat_with_model_streaming, [txt, chatbot], [chatbot, txt])
91-
txt.submit(chat_with_model_streaming, [txt, chatbot], [chatbot, txt])
92-
clear.click(lambda: ([], ""), [], [chatbot, txt])
115+
msg_input.submit(chat_with_model_streaming, inputs=[msg_input, chatbot], outputs=[chatbot, msg_input, speed_display]) # ✅ 按 Enter 触发
116+
submit_btn.click(chat_with_model_streaming, inputs=[msg_input, chatbot], outputs=[chatbot, msg_input, speed_display]) # ✅ 按按钮触发
117+
clear_btn.click(clear_chat, inputs=[], outputs=[chatbot, msg_input, speed_display]) # ✅ 清除聊天 & 计时
93118

119+
demo.queue() # ✅ 允许流式数据传输
94120
demo.launch(server_name=args.ip)

vllm/demo/gradio_demo/app.py.bak

Lines changed: 0 additions & 88 deletions
This file was deleted.

vllm/demo/gradio_demo/app_musa.py

Lines changed: 0 additions & 88 deletions
This file was deleted.

vllm/demo/gradio_demo/gradio_musa.py

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
11
import gradio as gr
22

33

4-
TITLE="在摩尔线程KUAE集群上基于MT Transformer推理引擎运行的QwQ 32B推理模型"
4+
TITLE=""
55

66
TOP = """\
77
<div class="top">
88
<div class="top-container">
99
<img class="logo" width="140" height="37" src="https://kuae-playground.mthreads.com/image/[email protected]">
10-
<h2>夸娥工场</h2>
10+
<h2>GPU GENIUS</h2>
1111
</div>
1212
</div>"""
1313

@@ -19,11 +19,7 @@
1919
window.onload = function() {
2020
document.title = "''' + TITLE + '''";
2121
}'''
22-
HEADER = TOP + "<h1>" + TITLE + "</h1><p>" + '''\
23-
在摩尔线程KUAE集群上,QwQ 32B模型通过摩尔线程推理vLLM + MT Transformer引擎高效运行。
24-
<p>QwQ 32B是Qwen系列中的推理模型。与传统的指令调优模型相比,QwQ 32B模型具有思考和推理的能力,在下游任务中,尤其是在解决困难问题时,性能显著提升。QwQ 32B是中型推理模型,能够与当前最先进的推理模型(如DeepSeek-R1和o1-mini)相媲美,展现出竞争力的表现。</p>
25-
<p>借助于<b>摩尔线程KUAE集群</b>和<b>MT Transformer引擎</b>的强大支持,QwQ 32B模型更好地应对复杂任务,推动了智能推理技术的发展。
26-
</p>'''
22+
HEADER = TOP + "<h1>" + TITLE + "</h1><p>"
2723

2824

2925

0 commit comments

Comments
 (0)