diff --git a/benchmarks/llm/scripts/vLLM/test.py b/benchmarks/llm/scripts/vLLM/test.py
new file mode 100644
index 0000000..4009bfd
--- /dev/null
+++ b/benchmarks/llm/scripts/vLLM/test.py
@@ -0,0 +1,183 @@
+import time
+import argparse
+import pandas as pd
+import json
+import os
+import torch
+from vllm import LLM, SamplingParams
+
+# Try to import the plotting library
+try:
+    import matplotlib.pyplot as plt
+    MATPLOTLIB_AVAILABLE = True
+except ImportError:
+    MATPLOTLIB_AVAILABLE = False
+
+
+# --- Exact target token counts to sweep ---
+TARGET_PROMPT_TOKEN_COUNTS = [32, 64, 128, 256, 512, 1024, 2048]
+MAX_TOKENS_LIST = [32, 64, 128, 256, 512, 1024, 2048]
+# Reduced lists for a quick smoke test:
+# TARGET_PROMPT_TOKEN_COUNTS = [32, 64, 128]
+# MAX_TOKENS_LIST = [32, 64, 128, 256]
+
+# A generic, non-special token id used to fill the synthetic prompt
+FILLER_TOKEN_ID = 10
+
+
+def plot_results(df, output_filename="vllm_benchmark_plot.png"):
+    if not MATPLOTLIB_AVAILABLE:
+        print("\nWarning: Matplotlib is not installed. Run 'pip install matplotlib' to generate the performance plot.")
+        return
+
+    device_groups = df['device_ids'].unique()
+    prompt_token_lengths = sorted(df['prompt_tokens'].unique())
+    num_prompts = len(prompt_token_lengths)
+
+    fig, axes = plt.subplots(nrows=num_prompts, ncols=1, figsize=(12, 6 * num_prompts), squeeze=False)
+    fig.suptitle('vLLM Throughput Benchmark', fontsize=16, y=1.0)
+
+    # One subplot per prompt length; one curve per device group.
+    for i, prompt_len in enumerate(prompt_token_lengths):
+        ax = axes[i, 0]
+        ax.set_title(f'Performance for Input Prompt Tokens = {prompt_len}', fontsize=12)
+        ax.set_xlabel('Generated Output Tokens')
+        ax.set_ylabel('Throughput (Tokens/Sec)')
+        ax.grid(True, which='both', linestyle='--', linewidth=0.5)
+
+        for group in device_groups:
+            subset = df[(df['prompt_tokens'] == prompt_len) & (df['device_ids'] == group)]
+            subset = subset.sort_values('output_tokens')
+            if not subset.empty:
+                ax.plot(subset['output_tokens'], subset['tokens_per_sec'], marker='o', linestyle='-',
+                        label=f'GPUs: {group} (tp={subset.iloc[0]["tp_size"]})')
+
+        ax.legend()
+
+    plt.tight_layout(rect=[0, 0, 1, 0.98])
+    plt.savefig(output_filename, dpi=300)
+    plt.close(fig)
+    print(f"\nPerformance plot saved to: {output_filename}")
+
+
+def run_single_benchmark(llm_engine, token_ids_list, sampling_params):
+    start_time = time.time()
+    # Feed token ids directly via the prompt_token_ids argument
+    outputs = llm_engine.generate(prompt_token_ids=token_ids_list, sampling_params=sampling_params, use_tqdm=False)
+    end_time = time.time()
+
+    total_time = end_time - start_time
+    result = outputs[0]
+
+    prompt_len = len(result.prompt_token_ids)
+    output_len = len(result.outputs[0].token_ids)
+
+    max_tokens_requested = sampling_params.max_tokens
+    if output_len < max_tokens_requested:
+        print(f"Warning: requested {max_tokens_requested} generated tokens, but only {output_len} were produced.")
+
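+    # Note: the timer spans the entire generate() call, so prompt prefill time
+    # is included in the denominator while only generated tokens are counted;
+    # long prompts with short outputs therefore understate decode throughput.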
+    tokens_per_sec = output_len / total_time
+
+    metrics = {
+        'prompt_tokens': prompt_len,
+        'output_tokens': output_len,
+        'max_steps_requested': max_tokens_requested,
+        'total_time_s': total_time,
+        'tokens_per_sec': tokens_per_sec
+    }
+
+    return metrics
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Run a vLLM performance benchmark. The model, the GPU device groups, and the input/output lengths are all configurable.",
+        formatter_class=argparse.RawTextHelpFormatter
+    )
+    parser.add_argument("model_path", type=str, help="Path of the model to benchmark.")
+    parser.add_argument("--device-groups", nargs='+', type=str, required=True,
+                        help="[Required] One or more GPU device groups to test, e.g.: --device-groups \"0,1\" \"0,1,2,3\"")
+    parser.add_argument("--dtype", type=str, default="auto", help="Data type used to load the model.")
+    args = parser.parse_args()
+
+    print(f"Benchmarking model: {args.model_path}")
+    print(f"Device groups to test in sequence: {args.device_groups}\n")
+
+    all_results = []
+    total_gpu_available = torch.cuda.device_count()
+    print(f"Detected {total_gpu_available} available GPUs on this system.")
+
+    for device_group_str in args.device_groups:
+        print(f"\n{'='*25} Starting device group: \"{device_group_str}\" {'='*25}")
+        device_id_list = [int(i) for i in device_group_str.split(',')]
+        tp_size = len(device_id_list)
+        max_requested_id = max(device_id_list)
+
+        # Validate the group before touching CUDA_VISIBLE_DEVICES.
+        if max_requested_id >= total_gpu_available:
+            print(f"Error: device group \"{device_group_str}\" requests nonexistent GPU id {max_requested_id}. Skipping this group.")
+            continue
+
+        os.environ["CUDA_VISIBLE_DEVICES"] = device_group_str
+        print(f"Config: tensor parallel size = {tp_size}, physical GPU ids = {device_id_list}")
+        print("Initializing the vLLM engine...")
+
+        try:
+            llm = LLM(model=args.model_path, tensor_parallel_size=tp_size, dtype=args.dtype, trust_remote_code=True)
+            print("vLLM engine initialized.")
+        except Exception as e:
+            print(f"Error: failed to initialize the vLLM engine for device group \"{device_group_str}\": {e}")
+            print("Skipping this device group.")
+            continue
+
+        # --- Sweep over the exact token counts ---
+        for prompt_len in TARGET_PROMPT_TOKEN_COUNTS:
+            # Build the token-id list directly
+            prompt_token_ids = [FILLER_TOKEN_ID] * prompt_len
+
+            for max_tokens in MAX_TOKENS_LIST:
+                # Note: vLLM limits the total length (input + output); combinations that exceed it would fail
+                if prompt_len + max_tokens > llm.llm_engine.model_config.max_model_len:
+                    print(f"\nSkipping: input ({prompt_len}) + output ({max_tokens}) > model max length ({llm.llm_engine.model_config.max_model_len})")
+                    continue
+
+                print(f"\n{'-'*10} Testing: Input_Tokens: {prompt_len}, Max_Output_Tokens: {max_tokens} {'-'*10}")
+                # Greedy decoding; ignore_eos forces generation to run for the full max_tokens.
+                sampling_params = SamplingParams(n=1, temperature=0.0, max_tokens=max_tokens, ignore_eos=True)
+
+                try:
+                    extracted_metrics = run_single_benchmark(llm, [prompt_token_ids], sampling_params)
+                    if extracted_metrics:
+                        extracted_metrics['device_ids'] = device_group_str
+                        extracted_metrics['tp_size'] = tp_size
+                        all_results.append(extracted_metrics)
+                        print("Collected performance metrics:")
+                        formatted_metrics = {k: f"{v:.4f}" if isinstance(v, float) else v for k, v in extracted_metrics.items()}
+                        print(json.dumps(formatted_metrics, indent=2))
+                except Exception as e:
+                    print(f"Error: exception while running the benchmark: {e}")
+                    print("Skipping this test case.")
+
+        print(f"\nDevice group \"{device_group_str}\" finished; releasing resources...")
+        del llm
+        torch.cuda.empty_cache()
+        print("Resources released.")
+
+    if all_results:
+        print(f"\n\n{'='*30} Summary of all results {'='*30}")
+        df = pd.DataFrame(all_results)
+        cols_ordered = ['device_ids', 'tp_size', 'prompt_tokens', 'output_tokens',
+                        'tokens_per_sec', 'total_time_s', 'max_steps_requested']
+        # Keep only the columns that actually exist
+        cols_ordered = [col for col in cols_ordered if col in df.columns]
+        df = df[cols_ordered]
+        pd.options.display.float_format = '{:,.4f}'.format
+        print(df.to_string(index=False))
+
+        output_filename = "vllm_benchmark_results_multi_group.csv"
+        df.to_csv(output_filename, index=False)
+        print(f"\nResults saved to: {output_filename}")
+
+        plot_results(df, "vllm_benchmark_plot.png")
+
+    else:
+        print("\nAll tests finished, but no performance data was collected.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmarks/llm/scripts/vLLM/test.sh b/benchmarks/llm/scripts/vLLM/test.sh
new file mode 100644
index 0000000..694151c
--- /dev/null
+++ b/benchmarks/llm/scripts/vLLM/test.sh
@@ -0,0 +1 @@
+python test.py Qwen/Qwen1.5-7B-Chat --device-groups "5" "5,6" "5,6,7,8"
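+# Note: the "5,6,7,8" group assumes at least 9 visible GPUs, since test.py
+# skips any group whose largest physical id is >= torch.cuda.device_count().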