diff --git a/test/bench/test_benchmark.py b/test/bench/test_benchmark.py
index f2e29e98..ec09a087 100644
--- a/test/bench/test_benchmark.py
+++ b/test/bench/test_benchmark.py
@@ -60,14 +60,15 @@ def __init__(
         self.benchmark = benchmark

         # Map device type string to infinicore device
+        # Note: metax and iluvatar use CUDA-compatible interfaces; moore uses the MUSA interface
         device_map = {
             "cpu": "cpu",
             "nvidia": "cuda",
             "cambricon": "cambricon",
             "ascend": "ascend",
-            "metax": "metax",
-            "moore": "moore",
-            "iluvatar": "iluvatar",
+            "metax": "cuda",
+            "moore": "musa",  # moore uses the musa interface
+            "iluvatar": "cuda",
             "kunlun": "kunlun",
             "hygon": "hygon",
         }
@@ -699,7 +700,7 @@ def test():
         device_type_str = "hygon"
     else:
         print(
-            "Usage: python test_benchmark.py [--cpu | --nvidia| --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon] --bench [ceval|mmlu] [--backend cpp|torch] [--ndev N] [--subject SUBJECT] [--num_samples N] [--max_new_tokens N] [--output_csv PATH] [--cache_dir PATH]"
+            "Usage: python test_benchmark.py [--cpu | --nvidia | --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon] --bench [ceval|mmlu] [--backend cpp|torch] [--ndev N] [--subject SUBJECT] [--split {test|val|all}] [--num_samples N] [--max_new_tokens N] [--output_csv PATH] [--cache_dir PATH]"
         )
         sys.exit(1)

diff --git a/test/bench/transformers_inference_test.py b/test/bench/transformers_inference_test.py
new file mode 100644
index 00000000..3f9c7eef
--- /dev/null
+++ b/test/bench/transformers_inference_test.py
@@ -0,0 +1,161 @@
+#!/usr/bin/env python3
+"""
+Minimal transformers model inference script
+for quickly testing model output.
+"""
+
+import torch
+import argparse
+import warnings
+import os
+import time
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+# Filter out transformers warnings
+warnings.filterwarnings("ignore")
+# Set environment variable to suppress generation-flags warnings
+os.environ['TRANSFORMERS_VERBOSITY'] = 'error'
+
+
+def main():
+    parser = argparse.ArgumentParser(description='Simple model inference script')
+    parser.add_argument('--model_path', type=str, required=True,
+                        help='Path to the model')
+    parser.add_argument('--prompt', type=str, default='你好,请介绍一下你自己。',
+                        help='Input prompt')
+    parser.add_argument('--max_new_tokens', type=int, default=512,
+                        help='Maximum number of new tokens to generate')
+    parser.add_argument('--device', type=str, default=None,
+                        help='Device to use, e.g. cuda:0 (auto-selected by default)')
+    parser.add_argument('--dtype', type=str, default='float16',
+                        choices=['float16', 'float32'],
+                        help='Data type')
+
+    args = parser.parse_args()
+
+    # Select the device
+    if args.device:
+        device = torch.device(args.device)
+    elif torch.cuda.is_available():
+        device = torch.device('cuda:0')
+    else:
+        device = torch.device('cpu')
+
+    print(f"Using device: {device}")
+    print(f"Loading model: {args.model_path}")
+
+    # Load the tokenizer
+    tokenizer = AutoTokenizer.from_pretrained(args.model_path, trust_remote_code=True)
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+
+    # Load the model
+    dtype_map = {
+        'float16': torch.float16,
+        'float32': torch.float32
+    }
+    model = AutoModelForCausalLM.from_pretrained(
+        args.model_path,
+        dtype=dtype_map[args.dtype],
+        trust_remote_code=True
+    )
+    model = model.to(device)
+    model.eval()
+
+    # Encode the input
+    print(f"\nInput prompt: {args.prompt}")
+    encoded = tokenizer(args.prompt, return_tensors='pt')
+    input_ids = encoded['input_ids'].to(device)
+    attention_mask = encoded['attention_mask'].to(device)
+
+    input_length = input_ids.shape[1]
+    pad_token_id = tokenizer.pad_token_id
+
+    # Warm-up (optional)
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
+
+    # Generate the response and measure timings
+    print("\nGenerating...")
+    with torch.no_grad():
+        # Prefill phase: process the input prompt
+        if torch.cuda.is_available():
+            torch.cuda.synchronize()
+        prefill_start = time.time()
+
+        # First forward pass (prefill)
+        outputs = model(input_ids=input_ids, attention_mask=attention_mask, use_cache=True)
+        past_key_values = outputs.past_key_values
+        next_token_logits = outputs.logits[:, -1, :]
+        next_token_id = torch.argmax(next_token_logits, dim=-1).unsqueeze(1)
+
+        if torch.cuda.is_available():
+            torch.cuda.synchronize()
+        prefill_end = time.time()
+        prefill_time = prefill_end - prefill_start
+
+        # Decode phase: generate new tokens
+        generated_ids = [input_ids, next_token_id]
+        decode_times = []
+
+        for _ in range(args.max_new_tokens - 1):
+            if torch.cuda.is_available():
+                torch.cuda.synchronize()
+            decode_start = time.time()
+
+            # Incremental generation using the cached past_key_values
+            outputs = model(
+                input_ids=next_token_id,
+                past_key_values=past_key_values,
+                use_cache=True
+            )
+            past_key_values = outputs.past_key_values
+            next_token_logits = outputs.logits[:, -1, :]
+            next_token_id = torch.argmax(next_token_logits, dim=-1).unsqueeze(1)
+            generated_ids.append(next_token_id)
+
+            if torch.cuda.is_available():
+                torch.cuda.synchronize()
+            decode_end = time.time()
+            decode_times.append(decode_end - decode_start)
+
+            # Stop if the EOS token was generated
+            if next_token_id.item() == tokenizer.eos_token_id:
+                break
+
+    # Concatenate all generated tokens
+    outputs = torch.cat(generated_ids, dim=1)
+
+    # Compute statistics
+    decode_time_total = sum(decode_times)
+    decode_count = len(decode_times)
+    generated_length = outputs.shape[1] - input_length
+
+    prefill_throughput = input_length / prefill_time if prefill_time > 0 else 0
+    decode_throughput = decode_count / decode_time_total if decode_time_total > 0 else 0
+    avg_decode_time = decode_time_total / decode_count if decode_count > 0 else 0
+
+    # Decode the output text
+    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+    # Print results
+    print("\n" + "="*80)
+    print("Model response:")
+    print("="*80)
+    print(response)
+    print("="*80)
+    print("\nPerformance statistics:")
+    print("="*80)
+    print(f"Input length: {input_length} tokens")
+    print(f"Generated length: {generated_length} tokens")
+    print(f"Prefill time: {prefill_time*1000:.2f} ms")
+    print(f"Prefill throughput: {prefill_throughput:.2f} tokens/s")
+    print(f"Decode total time: {decode_time_total*1000:.2f} ms")
+    print(f"Decode average time: {avg_decode_time*1000:.2f} ms/token")
+    print(f"Decode throughput: {decode_throughput:.2f} tokens/s")
+    print(f"Total time: {(prefill_time + decode_time_total)*1000:.2f} ms")
+    print("="*80)
+
+
+if __name__ == "__main__":
+    main()
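A possible invocation of the new script, based on the arguments it defines (the model path below is a placeholder for any local HuggingFace-format checkpoint):

    python test/bench/transformers_inference_test.py --model_path /path/to/model --max_new_tokens 128 --dtype float16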