Skip to content

Commit 3ef7a1c

Browse files
authored
Merge pull request #413 from tangzhiyi11/tzy
support train naml on kunlun
2 parents 69800a3 + b09ef56 commit 3ef7a1c

File tree

7 files changed

+183
-15
lines changed

7 files changed

+183
-15
lines changed
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
runner:
16+
train_data_dir: "../../../datasets/MIND/data/train"
17+
train_reader_path: "NAMLDataReader" # importlib format
18+
use_gpu: False
19+
use_xpu: True
20+
train_batch_size: 50
21+
epochs: 2
22+
print_interval: 10
23+
#model_init_path: "output_model/0" # init model
24+
model_save_path: "output_model_all"
25+
infer_batch_size: 10
26+
infer_reader_path: "NAMLDataReader" # importlib format
27+
test_data_dir: "../../../datasets/MIND/data/test"
28+
infer_load_path: "output_model_all"
29+
infer_start_epoch: 1
30+
infer_end_epoch: 2
31+
32+
# hyper parameters of user-defined network
33+
hyper_parameters:
34+
# optimizer config
35+
optimizer:
36+
class: Adam
37+
learning_rate: 0.001
38+
strategy: async
39+
# user-defined <key, value> pairs
40+
article_content_size: 100
41+
article_title_size: 10
42+
browse_size: 30
43+
neg_condidate_sample_size: 4
44+
word_dimension: 60
45+
category_dimension: 32
46+
category_size: 19
47+
sub_category_size: 294
48+
word_dict_size: 31440
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
runner:
16+
train_data_dir: "data/sample_data/train"
17+
train_reader_path: "NAMLDataReader" # importlib format
18+
use_gpu: False
19+
use_xpu: True
20+
train_batch_size: 10
21+
epochs: 20
22+
print_interval: 2
23+
#model_init_path: "output_model/0" # init model
24+
model_save_path: "output_model_all"
25+
infer_batch_size: 2
26+
infer_reader_path: "NAMLDataReader" # importlib format
27+
test_data_dir: "data/sample_data/train"
28+
infer_load_path: "output_model_all"
29+
infer_start_epoch: 0
30+
infer_end_epoch: 3
31+
32+
# hyper parameters of user-defined network
33+
hyper_parameters:
34+
# optimizer config
35+
optimizer:
36+
class: Adam
37+
learning_rate: 0.001
38+
strategy: async
39+
# user-defined <key, value> pairs
40+
article_content_size: 30
41+
article_title_size: 10
42+
browse_size: 10
43+
neg_condidate_sample_size: 4
44+
word_dimension: 30
45+
category_size: 4
46+
sub_category_size: 10
47+
category_dimension: 32
48+
word_dict_size: 101
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
# 使用昆仑XPU芯片加速NAML模型训练
2+
3+
## 准备Paddle昆仑XPU版训练环境
4+
[昆仑XPU芯片运行飞桨](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/xpu_docs/index_cn.html)
5+
6+
## 数据准备
7+
8+
### 示例数据
9+
参考 [数据准备](README.md#数据准备)
10+
11+
12+
### 全量数据
13+
```shell
14+
cd PaddleRec/datasets/MIND/data
15+
bash run.sh
16+
```
17+
18+
## 训练
19+
```shell
20+
# 设置训练使用的昆仑XPU芯片卡号
21+
export FLAGS_selected_xpus=0
22+
# 开启昆仑XPU芯片卷积计算加速(可不设置)
23+
export XPU_CONV_AUTOTUNE=2
24+
25+
cd PaddleRec/models/rank/naml
26+
# 全量数据静态图训练
27+
python3.7 -u ../../../tools/static_trainer.py -m config_bigdata_kunlun.yaml # 使用示例数据,请指定config_kunlun.yaml
28+
```
29+
30+
## 评估
31+
```shell
32+
# 设置预测使用的昆仑XPU芯片卡号
33+
export FLAGS_selected_xpus=0
34+
# 开启昆仑XPU芯片卷积计算加速(可不设置)
35+
export XPU_CONV_AUTOTUNE=2
36+
37+
cd PaddleRec/models/rank/naml
38+
# 全量数据静态图预测
39+
python3.7 -u ../../../tools/static_infer.py -m config_bigdata_kunlun.yaml # 使用示例数据,请指定config_kunlun.yaml
40+
```
41+
42+
## 模型效果
43+
以下为全量数据训练2个epoch的结果:
44+
45+
| 模型 | 训练auc |batch_size | epoch_num| Time of each epoch|
46+
| :------| :------ | :------ | :------| :------ |
47+
| naml | 0.71 | 50 | 2 | 约7小时 |
48+
49+
50+
| 模型 | 预测auc |batch_size | Time of each epoch|
51+
| :------| :------ | :------ | :------ |
52+
| naml | 0.67 | 10 | 约2小时 |

tools/infer.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ def main(args):
6363

6464
# tools.vars
6565
use_gpu = config.get("runner.use_gpu", True)
66+
use_xpu = config.get("runner.use_xpu", False)
6667
use_visual = config.get("runner.use_visual", False)
6768
test_data_dir = config.get("runner.test_data_dir", None)
6869
print_interval = config.get("runner.print_interval", None)
@@ -73,12 +74,16 @@ def main(args):
7374

7475
logger.info("**************common.configs**********")
7576
logger.info(
76-
"use_gpu: {}, use_visual: {}, infer_batch_size: {}, test_data_dir: {}, start_epoch: {}, end_epoch: {}, print_interval: {}, model_load_path: {}".
77-
format(use_gpu, use_visual, infer_batch_size, test_data_dir,
77+
"use_gpu: {}, use_xpu: {}, use_visual: {}, infer_batch_size: {}, test_data_dir: {}, start_epoch: {}, end_epoch: {}, print_interval: {}, model_load_path: {}".
78+
format(use_gpu, use_xpu, use_visual, infer_batch_size, test_data_dir,
7879
start_epoch, end_epoch, print_interval, model_load_path))
7980
logger.info("**************common.configs**********")
8081

81-
place = paddle.set_device('gpu' if use_gpu else 'cpu')
82+
if use_xpu:
83+
xpu_device = 'xpu:{0}'.format(os.getenv('FLAGS_selected_xpus', 0))
84+
place = paddle.set_device(xpu_device)
85+
else:
86+
place = paddle.set_device('gpu' if use_gpu else 'cpu')
8287

8388
dy_model = dy_model_class.create_model(config)
8489

tools/static_infer.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ def main(args):
6868
logger.info("cpu_num: {}".format(os.getenv("CPU_NUM")))
6969

7070
use_gpu = config.get("runner.use_gpu", True)
71+
use_xpu = config.get("runner.use_xpu", False)
7172
use_auc = config.get("runner.use_auc", False)
7273
use_visual = config.get("runner.use_visual", False)
7374
auc_num = config.get("runner.auc_num", 1)
@@ -80,12 +81,16 @@ def main(args):
8081
os.environ["CPU_NUM"] = str(config.get("runner.thread_num", 1))
8182
logger.info("**************common.configs**********")
8283
logger.info(
83-
"use_gpu: {}, use_visual: {}, infer_batch_size: {}, test_data_dir: {}, start_epoch: {}, end_epoch: {}, print_interval: {}, model_load_path: {}".
84-
format(use_gpu, use_visual, batch_size, test_data_dir, start_epoch,
85-
end_epoch, print_interval, model_load_path))
84+
"use_gpu: {}, use_xpu: {}, use_visual: {}, infer_batch_size: {}, test_data_dir: {}, start_epoch: {}, end_epoch: {}, print_interval: {}, model_load_path: {}".
85+
format(use_gpu, use_xpu, use_visual, batch_size, test_data_dir,
86+
start_epoch, end_epoch, print_interval, model_load_path))
8687
logger.info("**************common.configs**********")
8788

88-
place = paddle.set_device('gpu' if use_gpu else 'cpu')
89+
if use_xpu:
90+
xpu_device = 'xpu:{0}'.format(os.getenv('FLAGS_selected_xpus', 0))
91+
place = paddle.set_device(xpu_device)
92+
else:
93+
place = paddle.set_device('gpu' if use_gpu else 'cpu')
8994
exe = paddle.static.Executor(place)
9095
# initialize
9196
exe.run(paddle.static.default_startup_program())

tools/static_trainer.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ def main(args):
7272
logger.info("cpu_num: {}".format(os.getenv("CPU_NUM")))
7373

7474
use_gpu = config.get("runner.use_gpu", True)
75+
use_xpu = config.get("runner.use_xpu", False)
7576
use_auc = config.get("runner.use_auc", False)
7677
use_visual = config.get("runner.use_visual", False)
7778
use_inference = config.get("runner.use_inference", False)
@@ -87,12 +88,16 @@ def main(args):
8788
os.environ["CPU_NUM"] = str(config.get("runner.thread_num", 1))
8889
logger.info("**************common.configs**********")
8990
logger.info(
90-
"use_gpu: {}, use_visual: {}, train_batch_size: {}, train_data_dir: {}, epochs: {}, print_interval: {}, model_save_path: {}".
91-
format(use_gpu, use_visual, batch_size, train_data_dir, epochs,
92-
print_interval, model_save_path))
91+
"use_gpu: {}, use_xpu: {}, use_visual: {}, train_batch_size: {}, train_data_dir: {}, epochs: {}, print_interval: {}, model_save_path: {}".
92+
format(use_gpu, use_xpu, use_visual, batch_size, train_data_dir,
93+
epochs, print_interval, model_save_path))
9394
logger.info("**************common.configs**********")
9495

95-
place = paddle.set_device('gpu' if use_gpu else 'cpu')
96+
if use_xpu:
97+
xpu_device = 'xpu:{0}'.format(os.getenv('FLAGS_selected_xpus', 0))
98+
place = paddle.set_device(xpu_device)
99+
else:
100+
place = paddle.set_device('gpu' if use_gpu else 'cpu')
96101

97102
if use_fleet:
98103
from paddle.distributed import fleet

tools/trainer.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ def main(args):
6363

6464
# tools.vars
6565
use_gpu = config.get("runner.use_gpu", True)
66+
use_xpu = config.get("runner.use_xpu", False)
6667
use_visual = config.get("runner.use_visual", False)
6768
train_data_dir = config.get("runner.train_data_dir", None)
6869
epochs = config.get("runner.epochs", None)
@@ -74,12 +75,16 @@ def main(args):
7475

7576
logger.info("**************common.configs**********")
7677
logger.info(
77-
"use_gpu: {}, use_visual: {}, train_batch_size: {}, train_data_dir: {}, epochs: {}, print_interval: {}, model_save_path: {}".
78-
format(use_gpu, use_visual, train_batch_size, train_data_dir, epochs,
79-
print_interval, model_save_path))
78+
"use_gpu: {}, use_xpu: {}, use_visual: {}, train_batch_size: {}, train_data_dir: {}, epochs: {}, print_interval: {}, model_save_path: {}".
79+
format(use_gpu, use_xpu, use_visual, train_batch_size, train_data_dir,
80+
epochs, print_interval, model_save_path))
8081
logger.info("**************common.configs**********")
8182

82-
place = paddle.set_device('gpu' if use_gpu else 'cpu')
83+
if use_xpu:
84+
xpu_device = 'xpu:{0}'.format(os.getenv('FLAGS_selected_xpus', 0))
85+
place = paddle.set_device(xpu_device)
86+
else:
87+
place = paddle.set_device('gpu' if use_gpu else 'cpu')
8388

8489
dy_model = dy_model_class.create_model(config)
8590

0 commit comments

Comments
 (0)