Commit c526eb3 (2 parents: b21544b + 22d423f)

Merge pull request #41 from MooreThreads/add_webUI_demo

fix some problems for vllm demo

File tree

4 files changed (+17 −6 lines)


vllm/demo/gradio_demo/app.py

Lines changed: 9 additions & 5 deletions
@@ -57,6 +57,7 @@ def chat_with_model_streaming(user_input, history):
     # ✅ Record the start time
     start_time = time.time()
     token_count = 0  # ✅ Count the generated tokens
+    first_token_time = None
 
     try:
         # ✅ Stream the request with requests
@@ -77,14 +78,17 @@ def chat_with_model_streaming(user_input, history):
                     bot_response += delta["content"]
                     # ✅ Incrementally update the chat history
                     token_count += 1  # ✅ Count each token
+                    if first_token_time is None and token_count > 0:
+                        first_token_time = time.time()
+
                     yield history + [(user_input, bot_response)], "", "推理中..."
             except json.JSONDecodeError:
                 pass
         # ✅ Record the end time & compute the elapsed time
-        elapsed_time = time.time() - start_time
+        first_token_latency = first_token_time - start_time if first_token_time is not None else 0
+        elapsed_time = time.time() - first_token_time
         tps = token_count / elapsed_time if elapsed_time > 0 else 0  # ✅ Compute tokens per second
-
-        speed_text = f"⏱️ 耗时: {elapsed_time:.2f} 秒 | 🔢 Tokens: {token_count} | ⚡ 速度: {tps:.2f} TPS"
+        speed_text = f"⏳ 首字延迟: {first_token_latency:.2f} 秒 | ⏱️ 耗时: {elapsed_time:.2f} 秒 | 🔢 Tokens: {token_count} | ⚡ 速度: {tps:.2f} TPS"  # ⏳
         yield history + [(user_input, bot_response)], "", speed_text  # ✅ Return the inference speed
 
     except Exception as e:
@@ -95,15 +99,15 @@ def chat_with_model_streaming(user_input, history):
 
 # ✅ Clear the chat history & timer
 def clear_chat():
-    return [], "", "⏱️ 耗时: 0.00 秒 | 🔢 Tokens: 0 | ⚡ 速度: 0.00 TPS"  # ✅ Reset the whole UI
+    return [], "", "⏳ 首字延迟: 0.00 秒 | ⏱️ 耗时: 0.00 秒 | 🔢 Tokens: 0 | ⚡ 速度: 0.00 TPS"  # ✅ Reset the whole UI
 
 # Build the Gradio interface
 with gradio_musa.Blocks() as demo:
     # gr.Markdown("## 💬 Web UI 接入 vLLM 模型(流式输出)")
     chatbot = gr.Chatbot(label="Running on MTT S4000")
     msg_input = gr.Textbox(placeholder="请输入你的问题", label="输入...", lines=1, autofocus=True)
 
-    speed_display = gr.Textbox(label="推理速度", value="⏱️ 耗时: 0.00 秒 | 🔢 Tokens: 0 | ⚡ 速度: 0.00 TPS", interactive=False)  # ✅ Show the inference speed
+    speed_display = gr.Textbox(label="推理速度", value="⏳ 首字延迟: 0.00 秒 | ⏱️ 耗时: 0.00 秒 | 🔢 Tokens: 0 | ⚡ 速度: 0.00 TPS", interactive=False)  # ✅ Show the inference speed
 
     # clear = gr.Button("清除")
     # submit = gr.Button("提交")
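The revised timing logic separates time-to-first-token from decode throughput: TPS is now computed over the window starting at the first streamed token rather than at request time. A minimal standalone sketch of the same measurement, with a guard for streams that never yield a token (the measure_stream helper and the fake token iterator below are illustrative, not part of the demo):

import time

def measure_stream(token_iter):
    """Consume a token stream; report first-token latency, decode TPS, and token count."""
    start_time = time.time()
    first_token_time = None
    token_count = 0

    for _ in token_iter:
        token_count += 1
        if first_token_time is None:
            first_token_time = time.time()  # reference point for time-to-first-token

    if first_token_time is None:
        return 0.0, 0.0, 0  # guard: the stream produced no tokens at all

    first_token_latency = first_token_time - start_time
    decode_time = time.time() - first_token_time
    tps = token_count / decode_time if decode_time > 0 else 0.0
    return first_token_latency, tps, token_count

# Example with a fake three-token stream:
latency, tps, count = measure_stream(iter(["Hello", ",", " world"]))
print(f"first-token latency: {latency:.2f} s | tokens: {count} | speed: {tps:.2f} TPS")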

vllm/demo/gradio_demo/gradio_musa.py

Lines changed: 0 additions & 1 deletion
@@ -7,7 +7,6 @@
     <div class="top">
         <div class="top-container">
             <img class="logo" width="140" height="37" src="https://kuae-playground.mthreads.com/image/[email protected]">
-            <h2>GPU GENIUS</h2>
         </div>
     </div>"""
1312

vllm/demo/run_vllm_serving.sh

Lines changed: 1 addition & 0 deletions
@@ -241,6 +241,7 @@ wait_for_log_update() {
 echo -e "\e[32mInstalling gradio...\e[0m" >&2
 pip install gradio
 echo -e "\e[32mStart gradio webui...\e[0m" >&2
+echo -e "\e[32mContainer: $CONTAINER_NAME\e[0m" >&2
 setsid python -u ./gradio_demo/app.py --ip "$host" --port "$port" --model-name "$model_name" | tee -a webui.log &
 wait $!  # wait for this process to exit
 exit 0
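The launcher passes --ip, --port, and --model-name to app.py. A minimal sketch of how the demo script might consume those flags, assuming argparse; the actual option handling inside app.py is not shown in this diff:

import argparse

# Hypothetical flag parsing matching the launcher's invocation above.
parser = argparse.ArgumentParser(description="Gradio web UI for a vLLM server")
parser.add_argument("--ip", default="127.0.0.1", help="address the vLLM server listens on")
parser.add_argument("--port", type=int, default=8000, help="vLLM server port")
parser.add_argument("--model-name", required=True, help="model identifier sent with each request")
args = parser.parse_args()

api_url = f"http://{args.ip}:{args.port}/v1/chat/completions"  # assumed OpenAI-compatible endpoint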

vllm/demo/supported_models.json

Lines changed: 7 additions & 0 deletions
@@ -25,5 +25,12 @@
     "modelscope_url": "https://www.modelscope.cn/Qwen/Qwen2.5-0.5B-Instruct.git",
     "huggingface_url": "https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct",
     "tensor_parallel_size": [1]
+  },
+
+  "qwq-32b": {
+    "name": "Qwen/QwQ-32B",
+    "modelscope_url": "https://www.modelscope.cn/Qwen/QwQ-32B.git",
+    "huggingface_url": "https://huggingface.co/Qwen/QwQ-32B",
+    "tensor_parallel_size": [2, 4]
   }
 }
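Each registry entry pairs a model key with its download URLs and the tensor-parallel degrees it supports; QwQ-32B is registered for TP 2 or 4 only. A small sketch of how a launcher might validate a requested TP size against this file (the allowed_tp_sizes helper is hypothetical; the path is the file touched in this diff):

import json

def allowed_tp_sizes(model_key, path="vllm/demo/supported_models.json"):
    """Look up the tensor-parallel sizes registered for a model key."""
    with open(path) as f:
        registry = json.load(f)
    return registry[model_key]["tensor_parallel_size"]  # KeyError if unregistered

# QwQ-32B is listed with TP 2 or 4, so a single-GPU launch should be rejected:
requested_tp = 1
if requested_tp not in allowed_tp_sizes("qwq-32b"):
    raise SystemExit(f"tensor_parallel_size={requested_tp} not supported for qwq-32b")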
