Skip to content

Commit d4c2ff0

Browse files
committed
Update app.py to add first-token latency
1 parent de97dc0 commit d4c2ff0

File tree

1 file changed

+9
-5
lines changed

1 file changed

+9
-5
lines changed

vllm/demo/gradio_demo/app.py

Lines changed: 9 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -57,6 +57,7 @@ def chat_with_model_streaming(user_input, history):
5757
# ✅ 记录开始时间
5858
start_time = time.time()
5959
token_count = 0 # ✅ 记录生成的 Token 数量
60+
first_token_time = None
6061

6162
try:
6263
# ✅ 使用 requests 的流式请求
@@ -77,14 +78,17 @@ def chat_with_model_streaming(user_input, history):
7778
bot_response += delta["content"]
7879
# ✅ 逐步更新聊天记录
7980
token_count += 1 # ✅ 每个 Token 计数
81+
if first_token_time is None and token_count > 0:
82+
first_token_time = time.time()
83+
8084
yield history + [(user_input, bot_response)], "", "推理中..."
8185
except json.JSONDecodeError:
8286
pass
8387
# ✅ 记录结束时间 & 计算时长
84-
elapsed_time = time.time() - start_time
88+
first_token_latency = first_token_time - start_time if first_token_time is not None else 0
89+
elapsed_time = time.time() - first_token_time
8590
tps = token_count / elapsed_time if elapsed_time > 0 else 0 # ✅ 计算 Tokens Per Second
86-
87-
speed_text = f"⏱️ 耗时: {elapsed_time:.2f} 秒 | 🔢 Tokens: {token_count} | ⚡ 速度: {tps:.2f} TPS"
91+
speed_text = f"⏳ 首字延迟: {first_token_latency:.2f} | ⏱️ 耗时: {elapsed_time:.2f} 秒 | 🔢 Tokens: {token_count} | ⚡ 速度: {tps:.2f} TPS" # ⏳
8892
yield history + [(user_input, bot_response)], "", speed_text # ✅ 返回推理速度
8993

9094
except Exception as e:
@@ -95,15 +99,15 @@ def chat_with_model_streaming(user_input, history):
9599

96100
# ✅ 清除聊天记录 & 计时器
97101
def clear_chat():
98-
return [], "", "⏱️ 耗时: 0.00 秒 | 🔢 Tokens: 0 | ⚡ 速度: 0.00 TPS" # ✅ 清空所有 UI
102+
return [], "", "⏳ 首字延迟: 0.00 秒 | ⏱️ 耗时: 0.00 秒 | 🔢 Tokens: 0 | ⚡ 速度: 0.00 TPS" # ✅ 清空所有 UI
99103

100104
# 构建 Gradio 界面
101105
with gradio_musa.Blocks() as demo:
102106
# gr.Markdown("## 💬 Web UI 接入 vLLM 模型(流式输出)")
103107
chatbot = gr.Chatbot(label="Running on MTT S4000")
104108
msg_input = gr.Textbox(placeholder="请输入你的问题", label="输入...", lines=1, autofocus=True)
105109

106-
speed_display = gr.Textbox(label="推理速度", value="⏱️ 耗时: 0.00 秒 | 🔢 Tokens: 0 | ⚡ 速度: 0.00 TPS", interactive=False) # >✅ 显示推理速度
110+
speed_display = gr.Textbox(label="推理速度", value="⏳ 首字延迟: 0.00 秒 | ⏱️ 耗时: 0.00 秒 | 🔢 Tokens: 0 | ⚡ 速度: 0.00 TPS", interactive=False) # >✅ 显示推理速度
107111

108112
# clear = gr.Button("清除")
109113
# submit = gr.Button("提交")

0 commit comments

Comments
 (0)