Skip to content

Commit 5084245

Browse files
authored
[Test] Add GPT2 (#7)
1 parent 643430b commit 5084245

File tree

5 files changed

+167
-5
lines changed

5 files changed

+167
-5
lines changed

.github/actions/dependencies-action/action.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ inputs:
2929
runs:
3030
using: composite
3131
steps:
32+
# TODO(shink): Remove this step after building devel images
3233
- name: Install system dependencies
3334
shell: bash
3435
env:

.github/workflows/_build-and-test.yml

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
name: '[Internal] Build and Test'
1+
name: 'build-test'
22

33
on:
44
workflow_call:
@@ -18,6 +18,7 @@ on:
1818

1919
# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
2020
# declared as "shell: bash -el {0}" on steps that need to be properly activated.
21+
# The login shell is needed here so that the ascend-toolkit environment variables are activated.
2122
defaults:
2223
run:
2324
shell: bash -el {0}
@@ -123,7 +124,6 @@ jobs:
123124
- wheel
124125
- unittest-xml-reporting
125126
pip_requirements: |
126-
- /root/build/npu/pytorch/requirements.txt
127127
- /root/build/npu/pytorch/requirements.txt
128128
- /root/build/npu/pytorch/test/requirements.txt --no-deps
129129
@@ -132,10 +132,21 @@ jobs:
132132
run: |
133133
pip install ${{ needs.build.outputs.dist_name }}
134134
135+
# TODO(shink): Failures of this step are currently skipped via continue-on-error
135136
- name: Do the test
136-
continue-on-error: true # Skip if failed
137+
continue-on-error: true
137138
working-directory: /root/build
138139
run: |
139140
python npu/pytorch/ci/access_control_test.py
140141
env:
141142
DISABLED_TESTS_FILE: /root/build/npu/pytorch/test/unsupported_test_cases/.pytorch-disabled-tests.json
143+
144+
- name: Train GPT2
145+
working-directory: ./test
146+
run: |
147+
pip install -r requirements.txt
148+
pip install accelerate -U
149+
python gpt2_test.py
150+
env:
151+
IS_CI: true
152+
HF_ENDPOINT: https://hf-mirror.com

.github/workflows/ascend_npu_test.yml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,8 @@ on:
4949
options:
5050
- ascendai/cann:7.1-openeuler2203sp2
5151
- ascendai/cann:8.0.rc2.alpha003-910b-ubuntu22.04-py3.9
52-
default: 'ascendai/cann:8.0.rc2.alpha003-910b-ubuntu22.04-py3.9'
52+
- ascendai/cann:8.0.rc3.alpha002-910b-ubuntu22.04-py3.9
53+
default: 'ascendai/cann:8.0.rc3.alpha002-910b-ubuntu22.04-py3.9'
5354
description: 'The docker image which will be loaded'
5455

5556
# Only cancel the previous runs when triggered by a pull request
@@ -72,7 +73,7 @@ jobs:
7273
set -e
7374
echo "runner=${{ github.event.inputs.runner || 'self-hosted' }}" >> $GITHUB_OUTPUT
7475
echo "device=${{ github.event.inputs.device || '/dev/davinci6' }}" >> $GITHUB_OUTPUT
75-
echo "image=${{ github.event.inputs.image || 'ascendai/cann:8.0.rc2.alpha003-910b-ubuntu22.04-py3.9' }}" >> $GITHUB_OUTPUT
76+
echo "image=${{ github.event.inputs.image || 'ascendai/cann:8.0.rc3.alpha002-910b-ubuntu22.04-py3.9' }}" >> $GITHUB_OUTPUT
7677
7778
fetch-and-rebase:
7879
name: Fetch and rebase

test/gpt2_test.py

Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
import os
2+
3+
import torch
4+
import torch_npu
5+
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TrainingArguments, Trainer
6+
from datasets import load_dataset
7+
from transformers import DataCollatorForLanguageModeling
8+
9+
10+
# Fix the random seed
def set_seed(seed=42):
    """Seed torch's RNG and, when an NPU is available, the NPU RNGs too.

    Args:
        seed: Value passed to ``torch.manual_seed`` and
            ``torch.npu.manual_seed_all``.
    """
    torch.manual_seed(seed)
    if not torch.npu.is_available():
        return
    torch.npu.manual_seed_all(seed)
15+
16+
17+
# Train on the NPU and on the CPU and report both evaluation losses
def train_and_compare_gpt2(model_name):
    """Fine-tune ``model_name`` for one epoch per device and return the eval losses.

    The NPU run happens only when ``torch.npu.is_available()``. The CPU run is
    skipped when the ``IS_CI`` environment variable is enabled; a fixed
    placeholder loss of 3.0 is returned in its place.

    Args:
        model_name: Identifier passed to ``from_pretrained`` for both the
            model and the tokenizer.

    Returns:
        A ``(cpu_loss, gpu_loss)`` tuple. ``gpu_loss`` is ``None`` when no NPU
        is available; ``cpu_loss`` is the 3.0 placeholder when running in CI.
    """
    set_seed()

    def train_on_device(use_cpu=False):
        """Train and evaluate once; return ``eval_loss`` from ``trainer.evaluate()``."""
        # Load the GPT-2 model and tokenizer
        model = GPT2LMHeadModel.from_pretrained(model_name)
        tokenizer = GPT2Tokenizer.from_pretrained(model_name)
        # GPT-2 has no pad_token, so eos_token is used as the pad_token
        tokenizer.pad_token = tokenizer.eos_token

        # Load the wikitext-2 dataset
        # NOTE(review): wikitext presumably contains empty lines, which would
        # tokenize to all-padding rows — confirm this is acceptable here.
        train_dataset = load_dataset('wikitext', 'wikitext-2-raw-v1', split='train', verification_mode="no_checks")
        val_dataset = load_dataset('wikitext', 'wikitext-2-raw-v1', split='validation', verification_mode="no_checks")

        def preprocess_function(examples):
            return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=128)

        train_dataset = train_dataset.map(preprocess_function, batched=True)
        val_dataset = val_dataset.map(preprocess_function, batched=True)

        train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])
        val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])

        # Configure the training arguments
        training_args = TrainingArguments(
            output_dir='./results',
            per_device_train_batch_size=4,
            per_device_eval_batch_size=4,
            num_train_epochs=1,
            logging_dir='./logs',
            logging_steps=10,
            eval_strategy='epoch',
            save_strategy='epoch',
            report_to="none",
            use_cpu=use_cpu
        )

        data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

        # Create the Trainer
        trainer = Trainer(
            data_collator=data_collator,
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset
        )

        # Train the model
        trainer.train()

        # Evaluate the model
        metrics = trainer.evaluate()

        # Return the evaluation loss
        return metrics['eval_loss']

    # Train on the NPU (if one is available)
    if torch.npu.is_available():
        print("Training on NPU")
        gpu_loss = train_on_device(False)
        print(f"GPU Training Loss: {gpu_loss:.4f}")
    else:
        gpu_loss = None
        print("No GPU available for training.")

    # Environment variables are strings, so a bare truthiness check would treat
    # IS_CI=false or IS_CI=0 as "running in CI". Only explicit "off" spellings
    # (or an unset/empty variable) disable the CI shortcut; every other value
    # keeps enabling it as before.
    disabled_values = {"", "0", "false", "no", "off"}
    running_in_ci = os.getenv("IS_CI", "").strip().lower() not in disabled_values

    # Train on the CPU
    if running_in_ci:
        # Skip training when running in CI because it's too slow
        cpu_loss = 3.0
        print("Skipping CPU training in CI; reporting a placeholder loss.")
    else:
        print("Training on CPU")
        cpu_loss = train_on_device(True)

    print(f"CPU Training Loss: {cpu_loss:.4f}")

    return cpu_loss, gpu_loss
94+
95+
96+
# Run inference on the NPU and on the CPU and report both losses
def infer_and_compare_gpt2(model_name):
    """Compute the language-modeling loss of one fixed sentence on each device.

    Args:
        model_name: Identifier passed to ``from_pretrained`` for both the
            model and the tokenizer.

    Returns:
        A ``(cpu_loss, gpu_loss)`` tuple; ``gpu_loss`` is ``None`` when
        ``torch.npu.is_available()`` is false.
    """
    set_seed()

    def infer_on_device(device: torch.device):
        """Load the model onto ``device`` and return the loss as a Python float."""
        # Load the GPT-2 model and tokenizer
        model = GPT2LMHeadModel.from_pretrained(model_name)
        model = model.to(device)
        tokenizer = GPT2Tokenizer.from_pretrained(model_name)

        # Use eos_token as the pad_token
        tokenizer.pad_token = tokenizer.eos_token

        # Sentence used for the inference check
        test_sentence = "The quick brown fox jumps over the lazy dog."
        encoded = tokenizer(test_sentence, return_tensors="pt", padding=True, truncation=True)
        encoded = encoded.to(device)

        # The input ids double as labels so the model returns a loss
        with torch.no_grad():
            result = model(**encoded, labels=encoded["input_ids"])

        return result.loss.item()

    # Run inference on the NPU (if one is available)
    gpu_loss = None
    if torch.npu.is_available():
        gpu_loss = infer_on_device(torch.device('npu'))
        print(f"GPU Inference Loss: {gpu_loss:.4f}")
    else:
        print("No GPU available for inference.")

    # Run inference on the CPU
    cpu_loss = infer_on_device(torch.device('cpu'))

    print(f"CPU Inference Loss: {cpu_loss:.4f}")

    return cpu_loss, gpu_loss
135+
136+
137+
# Script entry point
def main():
    """Run the training comparison, then the inference comparison, for GPT-2."""
    model_name = "gpt2"

    # Train and report the training losses
    print("Comparing Training Loss:")
    train_and_compare_gpt2(model_name)

    # Run inference and report the inference losses
    print("\nComparing Inference Loss:")
    infer_and_compare_gpt2(model_name)


if __name__ == "__main__":
    main()

test/requirements.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
transformers==4.44.2
2+
datasets==2.21.0

0 commit comments

Comments
 (0)