| 
 | 1 | +import os  | 
 | 2 | + | 
 | 3 | +import torch  | 
 | 4 | +import torch_npu  | 
 | 5 | +from transformers import GPT2Tokenizer, GPT2LMHeadModel, TrainingArguments, Trainer  | 
 | 6 | +from datasets import load_dataset  | 
 | 7 | +from transformers import DataCollatorForLanguageModeling  | 
 | 8 | + | 
 | 9 | + | 
 | 10 | +# 固定随机种子  | 
 | 11 | +def set_seed(seed=42):  | 
 | 12 | +    torch.manual_seed(seed)  | 
 | 13 | +    if torch.npu.is_available():  | 
 | 14 | +        torch.npu.manual_seed_all(seed)  | 
 | 15 | + | 
 | 16 | + | 
 | 17 | +# 训练并比较 CPU 和 GPU 的训练损失  | 
 | 18 | +def train_and_compare_gpt2(model_name):  | 
 | 19 | +    set_seed()  | 
 | 20 | + | 
 | 21 | +    def train_on_device(use_cpu=False):  | 
 | 22 | +        # 加载 GPT-2 模型和 tokenizer  | 
 | 23 | +        model = GPT2LMHeadModel.from_pretrained(model_name)  | 
 | 24 | +        tokenizer = GPT2Tokenizer.from_pretrained(model_name)  | 
 | 25 | +        tokenizer.pad_token = tokenizer.eos_token  # GPT-2 没有 pad_token,需要将 eos_token 作为 pad_token  | 
 | 26 | + | 
 | 27 | +        # 加载 wikitext-2 数据集  | 
 | 28 | +        train_dataset = load_dataset('wikitext', 'wikitext-2-raw-v1', split='train', verification_mode="no_checks")  | 
 | 29 | +        val_dataset = load_dataset('wikitext', 'wikitext-2-raw-v1', split='validation', verification_mode="no_checks")  | 
 | 30 | + | 
 | 31 | +        def preprocess_function(examples):  | 
 | 32 | +            return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=128)  | 
 | 33 | + | 
 | 34 | +        train_dataset = train_dataset.map(preprocess_function, batched=True)  | 
 | 35 | +        val_dataset = val_dataset.map(preprocess_function, batched=True)  | 
 | 36 | + | 
 | 37 | +        train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])  | 
 | 38 | +        val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])  | 
 | 39 | + | 
 | 40 | +        # 设置训练参数  | 
 | 41 | +        training_args = TrainingArguments(  | 
 | 42 | +            output_dir='./results',  | 
 | 43 | +            per_device_train_batch_size=4,  | 
 | 44 | +            per_device_eval_batch_size=4,  | 
 | 45 | +            num_train_epochs=1,  | 
 | 46 | +            logging_dir='./logs',  | 
 | 47 | +            logging_steps=10,  | 
 | 48 | +            eval_strategy='epoch',  | 
 | 49 | +            save_strategy='epoch',  | 
 | 50 | +            report_to="none",  | 
 | 51 | +            use_cpu=use_cpu  | 
 | 52 | +        )  | 
 | 53 | + | 
 | 54 | +        data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)  | 
 | 55 | + | 
 | 56 | +        # 创建 Trainer  | 
 | 57 | +        trainer = Trainer(  | 
 | 58 | +            data_collator=data_collator,  | 
 | 59 | +            model=model,  | 
 | 60 | +            args=training_args,  | 
 | 61 | +            train_dataset=train_dataset,  | 
 | 62 | +            eval_dataset=val_dataset  | 
 | 63 | +        )  | 
 | 64 | + | 
 | 65 | +        # 训练模型  | 
 | 66 | +        trainer.train()  | 
 | 67 | + | 
 | 68 | +        # 评估模型  | 
 | 69 | +        metrics = trainer.evaluate()  | 
 | 70 | + | 
 | 71 | +        # 返回评估损失  | 
 | 72 | +        return metrics['eval_loss']  | 
 | 73 | + | 
 | 74 | +    # 在 GPU 上训练(如果有 GPU)  | 
 | 75 | +    if torch.npu.is_available():  | 
 | 76 | +        print(f"Training on NPU")  | 
 | 77 | +        gpu_loss = train_on_device(False)  | 
 | 78 | +        print(f"GPU Training Loss: {gpu_loss:.4f}")  | 
 | 79 | +    else:  | 
 | 80 | +        gpu_loss = None  | 
 | 81 | +        print("No GPU available for training.")  | 
 | 82 | + | 
 | 83 | +    # 在 CPU 上训练  | 
 | 84 | +    if os.getenv("IS_CI"):  | 
 | 85 | +        # Skip training when running in CI because it's too slow  | 
 | 86 | +        cpu_loss = 3.0  | 
 | 87 | +    else:  | 
 | 88 | +        print(f"Training on CPU")  | 
 | 89 | +        cpu_loss = train_on_device(True)  | 
 | 90 | + | 
 | 91 | +    print(f"CPU Training Loss: {cpu_loss:.4f}")  | 
 | 92 | + | 
 | 93 | +    return cpu_loss, gpu_loss  | 
 | 94 | + | 
 | 95 | + | 
 | 96 | +# 推理并比较 CPU 和 GPU 的推理损失  | 
 | 97 | +def infer_and_compare_gpt2(model_name):  | 
 | 98 | +    set_seed()  | 
 | 99 | + | 
 | 100 | +    def infer_on_device(device: torch.device):  | 
 | 101 | +        # 加载 GPT-2 模型和 tokenizer  | 
 | 102 | +        model = GPT2LMHeadModel.from_pretrained(model_name).to(device)  | 
 | 103 | +        tokenizer = GPT2Tokenizer.from_pretrained(model_name)  | 
 | 104 | + | 
 | 105 | +        # 设置 pad_token 为 eos_token  | 
 | 106 | +        tokenizer.pad_token = tokenizer.eos_token  | 
 | 107 | + | 
 | 108 | +        # 推理测试句子  | 
 | 109 | +        test_sentence = "The quick brown fox jumps over the lazy dog."  | 
 | 110 | +        inputs = tokenizer(test_sentence, return_tensors="pt", padding=True, truncation=True).to(device)  | 
 | 111 | + | 
 | 112 | +        with torch.no_grad():  | 
 | 113 | +            outputs = model(**inputs, labels=inputs["input_ids"])  | 
 | 114 | + | 
 | 115 | +        # 计算损失  | 
 | 116 | +        loss = outputs.loss.item()  | 
 | 117 | +        return loss  | 
 | 118 | + | 
 | 119 | +    # 在 GPU 上推理(如果有 GPU)  | 
 | 120 | +    if torch.npu.is_available():  | 
 | 121 | +        gpu_device = torch.device('npu')  | 
 | 122 | +        gpu_loss = infer_on_device(gpu_device)  | 
 | 123 | +        print(f"GPU Inference Loss: {gpu_loss:.4f}")  | 
 | 124 | +    else:  | 
 | 125 | +        gpu_loss = None  | 
 | 126 | +        print("No GPU available for inference.")  | 
 | 127 | + | 
 | 128 | +    # 在 CPU 上推理  | 
 | 129 | +    cpu_device = torch.device('cpu')  | 
 | 130 | +    cpu_loss = infer_on_device(cpu_device)  | 
 | 131 | + | 
 | 132 | +    print(f"CPU Inference Loss: {cpu_loss:.4f}")  | 
 | 133 | + | 
 | 134 | +    return cpu_loss, gpu_loss  | 
 | 135 | + | 
 | 136 | + | 
 | 137 | +# 主函数  | 
 | 138 | +if __name__ == "__main__":  | 
 | 139 | +    model_name = "gpt2"  | 
 | 140 | + | 
 | 141 | +    # 训练并比较训练损失  | 
 | 142 | +    print("Comparing Training Loss:")  | 
 | 143 | +    cpu_train_loss, gpu_train_loss = train_and_compare_gpt2(model_name)  | 
 | 144 | + | 
 | 145 | +    # 推理并比较推理损失  | 
 | 146 | +    print("\nComparing Inference Loss:")  | 
 | 147 | +    cpu_infer_loss, gpu_infer_loss = infer_and_compare_gpt2(model_name)  | 
0 commit comments