
Commit 1914c8a

Adopt new data pipeline for GPT. (PaddlePaddle#930)
1 parent 27727b1 commit 1914c8a

10 files changed: +214 additions, -168 deletions

examples/language_model/data_tools/README.md

Lines changed: 9 additions & 1 deletion
@@ -30,8 +30,10 @@
 - tqdm
 - numpy
 - pybind11
+- lac (optional)
+- zstandard (optional)
 
-Install with `pip install tqdm numpy pybind11`. In addition, some features require `g++>=4.8` for compilation.
+Install with `pip install tqdm numpy pybind11 lac zstandard`. In addition, some features require `g++>=4.8` for compilation.
 
 
 ## Full training-data pipeline
@@ -179,6 +181,12 @@ sh run_static.sh
 Customizable options include do_whole_word_mask, favor_longer_ngram, do_permutation, geometric_dist, etc.;
 refer to [Megatron](https://github.com/NVIDIA/Megatron-LM) for how to use these lm_mask strategies.
 
+### FAQ
+
+#### What should I do if the C++ code fails to compile?
+- First check that the pybind11 package is installed and that the g++ and make tools work properly.
+- A compilation failure may also stem from the Makefile commands in this folder. You can set python3 and python3-config in the Makefile to full paths, e.g. /usr/bin/python3.7.
+
 ## References
 
 Note: most of the data pipeline is adapted from [Megatron](https://github.com/NVIDIA/Megatron-LM); we hereby express our thanks.
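For the FAQ above, one way to confirm the build prerequisites before running `make` is a small check script. This is only an illustrative sketch, not part of the commit; it assumes the `pybind11` pip package and that g++, make, and python3-config should be on PATH:

```python
# Illustrative pre-build check for the FAQ above (not part of the commit).
# Verifies that pybind11 is importable and that the build tools are on PATH.
import shutil

try:
    import pybind11
    print("pybind11 include dir:", pybind11.get_include())
except ImportError:
    print("pybind11 missing: run `pip install pybind11`")

for tool in ("g++", "make", "python3-config"):
    path = shutil.which(tool)
    print("%-15s %s" % (tool, path if path else "NOT FOUND"))
```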

examples/language_model/data_tools/create_pretraining_data.py

Lines changed: 29 additions & 13 deletions
@@ -26,6 +26,12 @@
 
 import paddlenlp.transformers as tfs
 
+try:
+    import nltk
+    nltk_available = True
+except ImportError:
+    nltk_available = False
+
 
 def get_args():
     parser = argparse.ArgumentParser()
@@ -137,7 +143,6 @@ def process(line):
 
 def jieba_segmentation_fn():
     import jieba
-    jieba.initialize()  # manual initialization (optional)
 
     def process(line):
         words = jieba.cut(line)
@@ -280,12 +285,11 @@ def encode(self, json_line):
         if len(doc_ids) > 0 and self.args.append_eos:
             doc_ids[-1].append(Converter.tokenizer.eos_token_id)
 
-        return doc_ids, len(json_line.encode("utf-8"))
+        return doc_ids, len(text.encode("utf-8"))
 
 
 def main():
     args = get_args()
-    startup_start = time.time()
 
     file_paths = []
     if os.path.isfile(args.input_path):
@@ -318,16 +322,28 @@ def main():
 
     sent_count = 0
     token_count = 0
+
+    file_paths.sort()
+
+    step = 0
+    total_bytes_processed = 0
+    startup_start = time.time()
     for file_path in tqdm(file_paths):
-        total_bytes_processed = 0
-        text = open(file_path, 'r', encoding='utf-8')
-        encoded_docs = pool.imap(convert.encode, text, 256)
+        if file_path.endswith(".zst"):
+            import zstandard
+            cctx = zstandard.ZstdDecompressor()
+            fh = open(file_path, 'rb')
+            text = io.BufferedReader(cctx.stream_reader(fh))
+        elif file_path.endswith(".jsonl"):
+            text = open(file_path, 'r', encoding='utf-8')
+        else:
+            print("Unexpected data format, skipped %s" % file_path)
+            continue
 
-        startup_end = time.time()
-        proc_start = time.time()
-        print("Time to startup:", startup_end - startup_start)
+        encoded_docs = pool.imap(convert.encode, text, 256)
         print("Processing %s" % file_path)
         for i, (doc, bytes_processed) in enumerate(encoded_docs, start=1):
+            step += 1
             total_bytes_processed += bytes_processed
             if len(doc) == 0:
                 continue
@@ -352,13 +368,13 @@ def main():
                 sent_count.to_bytes(
                     8, byteorder='little', signed=True))
 
-            if i % args.log_interval == 0:
+            if step % args.log_interval == 0:
                 current = time.time()
-                elapsed = current - proc_start
+                elapsed = current - startup_start
                 mbs = total_bytes_processed / elapsed / 1024 / 1024
                 print(
-                    f"Processed {i} documents",
-                    f"({i/elapsed:.2f} docs/s, {mbs:.4f} MB/s).",
+                    f"Processed {step} documents",
+                    f"({step/elapsed:.2f} docs/s, {mbs:.4f} MB/s).",
                     file=sys.stderr)
 
     pool.close()
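The loop above now accepts both plain `.jsonl` files and zstandard-compressed `.zst` files. A minimal standalone sketch of that input-handling pattern is shown below; it is illustrative only, assumes the optional `zstandard` package is installed, and uses a placeholder file name:

```python
# Standalone sketch of the .jsonl / .zst line streaming used above (illustrative only).
import io
import json


def iter_json_lines(file_path):
    """Yield one parsed JSON document per line from a .jsonl or .jsonl.zst file."""
    if file_path.endswith(".zst"):
        import zstandard  # optional dependency, as in the commit
        cctx = zstandard.ZstdDecompressor()
        fh = open(file_path, "rb")
        stream = io.BufferedReader(cctx.stream_reader(fh))
    elif file_path.endswith(".jsonl"):
        stream = open(file_path, "r", encoding="utf-8")
    else:
        raise ValueError("Unexpected data format: %s" % file_path)
    for line in stream:
        if line.strip():
            yield json.loads(line)


# Placeholder path; any line-delimited JSON dump with a "text" field works here.
for doc in iter_json_lines("openwebtext2_sample.jsonl.zst"):
    print(doc.get("text", "")[:80])
    break
```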

examples/language_model/ernie-1.0/run_pretrain_static.py

Lines changed: 1 addition & 1 deletion
@@ -644,7 +644,7 @@ def do_train(args):
             if global_step >= args.max_steps:
                 eval_fetch = []
                 if topo.is_last:
-                    eval_fetch = [loss]
+                    eval_fetch = [loss, lm_loss, sop_loss]
 
                 run_evaluate(test_data_loader, exe, test_program,
                              args.test_iters, log_writer, global_step, args,

examples/language_model/gpt-3/static/run_pretrain_static.py

Lines changed: 22 additions & 4 deletions
@@ -19,6 +19,7 @@
 import os
 import random
 import time
+import sys
 
 os.path.expandvars('$HOME')
 os.path.expanduser('~')
@@ -34,6 +35,8 @@
 import paddlenlp.ops as ops
 from visualdl import LogWriter
 
+# Used to load the data_tools path; it must come before importing dataset.
+sys.path.insert(0, "../../")
 from dataset import create_pretrained_dataset
 from args import parse_args
 import lr
@@ -121,12 +124,25 @@ def dist_optimizer(args, topo):
 def get_train_data_file(args):
     files = [
         os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir)
-        if (os.path.isfile(os.path.join(args.input_dir, f)) and "npz_" not in
-            str(f))
+        if (os.path.isfile(os.path.join(args.input_dir, f)) and str(f).endswith(
+            "_idx.npz"))
+    ]
+    files = [x.replace("_idx.npz", "") for x in files]
+    if len(files) == 0:
+        logger.warning(
+            "Could not find datasets named xxx_ids.npy and xxx_idx.npz! Trying to find old compatible xxx_ids.npz files."
+        )
+    else:
+        return files
+
+    files = [
+        os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir)
+        if (os.path.isfile(os.path.join(args.input_dir, f)) and str(f).endswith(
+            "_ids.npz"))
     ]
 
-    data_file = files[0]
-    return data_file
+    files = [x.replace("_ids.npz", "") for x in files]
+    return files
 
 
 def init_static_with_params(model, dygraph_params, topo, prog=None):
@@ -189,6 +205,7 @@ def do_train(args):
 
     worker_num = fleet.worker_num()
     worker_index = fleet.worker_index()
+    local_rank = 0 if fleet.local_rank() is None else int(fleet.local_rank())
 
     topo = Topology(
         device_rank=worker_index,
@@ -237,6 +254,7 @@ def do_train(args):
     train_data_loader, valid_data_loader, test_data_loader = create_pretrained_dataset(
         args,
         data_file,
+        local_rank=local_rank,
         data_world_size=topo.data_info.size,
         data_world_rank=topo.data_info.rank,
         eos_id=eos_id,
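The reworked `get_train_data_file` above prefers the new `xxx_ids.npy` / `xxx_idx.npz` pairs and falls back to the legacy `xxx_ids.npz` files only when none are found. The sketch below restates that discovery logic as a self-contained snippet; `find_dataset_prefixes` is a hypothetical helper and the `./data` directory is a placeholder, not code from the commit:

```python
# Illustrative restatement of the dataset-prefix discovery above (not the commit's code).
import os


def find_dataset_prefixes(input_dir):
    """Return dataset prefixes, preferring the new _idx.npz format over legacy _ids.npz."""
    def prefixes_for(suffix):
        return [
            os.path.join(input_dir, f)[:-len(suffix)]
            for f in os.listdir(input_dir)
            if os.path.isfile(os.path.join(input_dir, f)) and f.endswith(suffix)
        ]

    found = prefixes_for("_idx.npz")
    if found:
        return found
    # Fall back to the old single-file .npz format.
    return prefixes_for("_ids.npz")


# Example: a folder containing gpt_en_dataset_300m_ids.npy / gpt_en_dataset_300m_idx.npz
if os.path.isdir("./data"):
    print(find_dataset_prefixes("./data"))  # -> ['./data/gpt_en_dataset_300m']
```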

examples/language_model/gpt/README.md

Lines changed: 22 additions & 22 deletions
@@ -8,10 +8,8 @@ GPT-[2](https://cdn.openai.com/better-language-models/language_models_are_unsupe
 ```text
 .
 ├── args.py                # training argument configuration
-├── create_pretraining_data.py # data preprocessing script
 ├── converter.py           # weight conversion script
 ├── dataset.py             # data processing
-├── decompress.sh          # dataset decompression script
 ├── deploy/                # inference scripts for model deployment
 ├── export_model.py        # script for exporting the model for inference deployment
 ├── faster_gpt/            # high-performance inference sample using FasterGPT
@@ -39,43 +37,45 @@ GPT-[2](https://cdn.openai.com/better-language-models/language_models_are_unsupe
 
 ### Data preparation
 
-#### Obtaining the raw data
+#### Obtaining and building the dataset
 
 [OpenWebTextCorpus](https://skylion007.github.io/OpenWebTextCorpus/) is an open-source English web-text dataset sourced from Reddit; after deduplication, cleaning, and extraction it contains more than 8 million documents.
+This example uses the [OpenWebText2 data](https://openwebtext2.readthedocs.io/en/latest/index.html#download-plug-and-play-version) already cleaned by EleutherAI.
 
 After downloading, decompress the data with the following commands:
 
 ```shell
-xz -d openwebtext.tar.xz
-tar xf openwebtext.tar
-mkdir raw_data
-bash decompress.sh
+wget https://the-eye.eu/public/AI/pile_preliminary_components/openwebtext2.jsonl.zst.tar
+tar -xvf openwebtext2.jsonl.zst.tar -C /path/to/openwebtext
 ```
 
-The `raw_data` directory obtained after decompression is about 54GB in size.
+Then build the dataset with the `create_pretraining_data.py` script under [data_tools](../data_tools):
 
-#### Data preprocessing
-
-To speed up training, we convert the text into the corresponding token ids before training and save them in npz format:
-
-```shell
-python create_pretraining_data.py --input_path raw_data \
-    --model_name gpt2-en \
-    --append_eod \
-    --workers 8
 ```
+python -u create_pretraining_data.py \
+    --model_name gpt2-en \
+    --tokenizer_name GPTTokenizer \
+    --data_format JSON \
+    --input_path /path/to/openwebtext/ \
+    --append_eos \
+    --output_prefix gpt_openwebtext \
+    --workers 40 \
+    --log_interval 10000
+```
+Processing takes roughly an hour, after which you get the `gpt_openwebtext_ids.npy` and `gpt_openwebtext_idx.npz` dataset files.
 
-After running the command, a `raw_data_ids.npz` file is produced. To make it easy to run and test this model, the project provides a preprocessed 300M training sample:
-
+To make it easy to run and test this model, the project provides a preprocessed 300M training sample:
 ```shell
-wget https://paddlenlp.bj.bcebos.com/models/transformers/gpt/train.data.json_ids.npz
+wget https://paddlenlp.bj.bcebos.com/models/transformers/gpt/data/gpt_en_dataset_300m_ids.npy
+wget https://paddlenlp.bj.bcebos.com/models/transformers/gpt/data/gpt_en_dataset_300m_idx.npz
 ```
 
-Put all the preprocessed npz files into a single folder for training:
+Put all the preprocessed files into a single folder for training:
 
 ```
 mkdir data
-mv train.data.json_ids.npz data
+mv gpt_en_dataset_300m_ids.npy ./data
+mv gpt_en_dataset_300m_idx.npz ./data
 ```
 
 ### Model training
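Once the `data/` folder is in place, a quick sanity check of the two dataset files can be done with numpy. This is only a sketch: it assumes the 300M sample downloaded above sits in `./data`, and it does not assume anything about the key names inside the `.npz` index file:

```python
# Quick look at the preprocessed dataset files referenced in the README above.
# Paths assume the ./data folder created by the commands in this section.
import numpy as np

ids = np.load("./data/gpt_en_dataset_300m_ids.npy", mmap_mode="r")
idx = np.load("./data/gpt_en_dataset_300m_idx.npz")

print("token ids:", ids.shape, ids.dtype)   # flat array of token ids
print("index arrays:", idx.files)           # names of the stored index arrays
```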

examples/language_model/gpt/args.py

Lines changed: 1 addition & 7 deletions
@@ -129,17 +129,11 @@ def parse_args(MODEL_CLASSES):
         help="Epsilon for Adam optimizer.")
 
     # Training steps config
-    parser.add_argument(
-        "--num_train_epochs",
-        default=1,
-        type=int,
-        help="Total number of training epochs to perform.", )
     parser.add_argument(
         "--max_steps",
         default=500000,
         type=int,
-        help="If > 0: set total number of training steps to perform. Override num_train_epochs."
-    )
+        help="Set the total number of training steps to perform.")
     parser.add_argument(
         "--save_steps",
         type=int,

examples/language_model/gpt/create_pretraining_data.py

Lines changed: 0 additions & 91 deletions
This file was deleted.
