Skip to content

Commit 1d0ba1a

Browse files
committed
support train naml on kunlun
1 parent 3d49e2c commit 1d0ba1a

File tree

6 files changed

+116
-15
lines changed

6 files changed

+116
-15
lines changed
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
runner:
16+
train_data_dir: "../../../datasets/MIND/data/train"
17+
train_reader_path: "NAMLDataReader" # importlib format
18+
use_gpu: False
19+
use_xpu: True
20+
train_batch_size: 50
21+
epochs: 2
22+
print_interval: 10
23+
#model_init_path: "output_model/0" # init model
24+
model_save_path: "output_model_all"
25+
infer_batch_size: 10
26+
infer_reader_path: "NAMLDataReader" # importlib format
27+
test_data_dir: "../../../datasets/MIND/data/test"
28+
infer_load_path: "output_model_all"
29+
infer_start_epoch: 1
30+
infer_end_epoch: 2
31+
32+
# hyper parameters of user-defined network
33+
hyper_parameters:
34+
# optimizer config
35+
optimizer:
36+
class: Adam
37+
learning_rate: 0.001
38+
strategy: async
39+
# user-defined <key, value> pairs
40+
article_content_size: 100
41+
article_title_size: 10
42+
browse_size: 30
43+
neg_condidate_sample_size: 4
44+
word_dimension: 60
45+
category_dimension: 32
46+
category_size: 19
47+
sub_category_size: 294
48+
word_dict_size: 31440
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
# How to train NAML on Kunlun
2+
3+
## Prepare the Kunlun environment
4+
[Paddle installation for machines with Kunlun XPU card](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/2.0-rc1/install/install_Kunlun_zh.html)
5+
6+
## Prepare data
7+
```shell
8+
cd PaddleRec/datasets/MIND/data
9+
bash run.sh
10+
```
11+
12+
## Train
13+
```shell
14+
# select the Kunlun XPU card id
15+
export FLAGS_selected_xpus=0
16+
# enable convolution autotune
17+
export XPU_CONV_AUTOTUNE=2
18+
19+
cd PaddleRec/models/rank/naml
20+
python3.7 -u ../../../tools/trainer.py -m config_bigdata_kunlun.yaml
21+
```
22+
23+
24+
## Eval
25+
```shell
26+
# select the Kunlun XPU card id
27+
export FLAGS_selected_xpus=0
28+
# enable convolution autotune
29+
export XPU_CONV_AUTOTUNE=2
30+
31+
cd PaddleRec/models/rank/naml
32+
python3.7 -u ../../../tools/infer.py -m config_bigdata_kunlun.yaml
33+
```

tools/infer.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ def main(args):
6363

6464
# tools.vars
6565
use_gpu = config.get("runner.use_gpu", True)
66+
use_xpu = config.get("runner.use_xpu", False)
6667
use_visual = config.get("runner.use_visual", False)
6768
test_data_dir = config.get("runner.test_data_dir", None)
6869
print_interval = config.get("runner.print_interval", None)
@@ -73,12 +74,16 @@ def main(args):
7374

7475
logger.info("**************common.configs**********")
7576
logger.info(
76-
"use_gpu: {}, use_visual: {}, infer_batch_size: {}, test_data_dir: {}, start_epoch: {}, end_epoch: {}, print_interval: {}, model_load_path: {}".
77-
format(use_gpu, use_visual, infer_batch_size, test_data_dir,
77+
"use_gpu: {}, use_xpu: {}, use_visual: {}, infer_batch_size: {}, test_data_dir: {}, start_epoch: {}, end_epoch: {}, print_interval: {}, model_load_path: {}".
78+
format(use_gpu, use_xpu, use_visual, infer_batch_size, test_data_dir,
7879
start_epoch, end_epoch, print_interval, model_load_path))
7980
logger.info("**************common.configs**********")
8081

81-
place = paddle.set_device('gpu' if use_gpu else 'cpu')
82+
if use_xpu:
83+
xpu_device = 'xpu:{0}'.format(os.getenv('FLAGS_selected_xpus', 0))
84+
place = paddle.set_device(xpu_device)
85+
else:
86+
place = paddle.set_device('gpu' if use_gpu else 'cpu')
8287

8388
dy_model = dy_model_class.create_model(config)
8489

tools/static_infer.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ def main(args):
6868
logger.info("cpu_num: {}".format(os.getenv("CPU_NUM")))
6969

7070
use_gpu = config.get("runner.use_gpu", True)
71+
use_xpu = config.get("runner.use_xpu", False)
7172
use_auc = config.get("runner.use_auc", False)
7273
use_visual = config.get("runner.use_visual", False)
7374
auc_num = config.get("runner.auc_num", 1)
@@ -80,12 +81,16 @@ def main(args):
8081
os.environ["CPU_NUM"] = str(config.get("runner.thread_num", 1))
8182
logger.info("**************common.configs**********")
8283
logger.info(
83-
"use_gpu: {}, use_visual: {}, infer_batch_size: {}, test_data_dir: {}, start_epoch: {}, end_epoch: {}, print_interval: {}, model_load_path: {}".
84-
format(use_gpu, use_visual, batch_size, test_data_dir, start_epoch,
85-
end_epoch, print_interval, model_load_path))
84+
"use_gpu: {}, use_xpu: {}, use_visual: {}, infer_batch_size: {}, test_data_dir: {}, start_epoch: {}, end_epoch: {}, print_interval: {}, model_load_path: {}".
85+
format(use_gpu, use_xpu, use_visual, batch_size, test_data_dir,
86+
start_epoch, end_epoch, print_interval, model_load_path))
8687
logger.info("**************common.configs**********")
8788

88-
place = paddle.set_device('gpu' if use_gpu else 'cpu')
89+
if use_xpu:
90+
xpu_device = 'xpu:{0}'.format(os.getenv('FLAGS_selected_xpus', 0))
91+
place = paddle.set_device(xpu_device)
92+
else:
93+
place = paddle.set_device('gpu' if use_gpu else 'cpu')
8994
exe = paddle.static.Executor(place)
9095
# initialize
9196
exe.run(paddle.static.default_startup_program())

tools/static_trainer.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ def main(args):
7272
logger.info("cpu_num: {}".format(os.getenv("CPU_NUM")))
7373

7474
use_gpu = config.get("runner.use_gpu", True)
75+
use_xpu = config.get("runner.use_xpu", False)
7576
use_auc = config.get("runner.use_auc", False)
7677
use_visual = config.get("runner.use_visual", False)
7778
use_inference = config.get("runner.use_inference", False)
@@ -87,12 +88,16 @@ def main(args):
8788
os.environ["CPU_NUM"] = str(config.get("runner.thread_num", 1))
8889
logger.info("**************common.configs**********")
8990
logger.info(
90-
"use_gpu: {}, use_visual: {}, train_batch_size: {}, train_data_dir: {}, epochs: {}, print_interval: {}, model_save_path: {}".
91-
format(use_gpu, use_visual, batch_size, train_data_dir, epochs,
92-
print_interval, model_save_path))
91+
"use_gpu: {}, use_xpu: {}, use_visual: {}, train_batch_size: {}, train_data_dir: {}, epochs: {}, print_interval: {}, model_save_path: {}".
92+
format(use_gpu, use_xpu, use_visual, batch_size, train_data_dir,
93+
epochs, print_interval, model_save_path))
9394
logger.info("**************common.configs**********")
9495

95-
place = paddle.set_device('gpu' if use_gpu else 'cpu')
96+
if use_xpu:
97+
xpu_device = 'xpu:{0}'.format(os.getenv('FLAGS_selected_xpus', 0))
98+
place = paddle.set_device(xpu_device)
99+
else:
100+
place = paddle.set_device('gpu' if use_gpu else 'cpu')
96101

97102
if use_fleet:
98103
from paddle.distributed import fleet

tools/trainer.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ def main(args):
6363

6464
# tools.vars
6565
use_gpu = config.get("runner.use_gpu", True)
66+
use_xpu = config.get("runner.use_xpu", False)
6667
use_visual = config.get("runner.use_visual", False)
6768
train_data_dir = config.get("runner.train_data_dir", None)
6869
epochs = config.get("runner.epochs", None)
@@ -74,12 +75,16 @@ def main(args):
7475

7576
logger.info("**************common.configs**********")
7677
logger.info(
77-
"use_gpu: {}, use_visual: {}, train_batch_size: {}, train_data_dir: {}, epochs: {}, print_interval: {}, model_save_path: {}".
78-
format(use_gpu, use_visual, train_batch_size, train_data_dir, epochs,
79-
print_interval, model_save_path))
78+
"use_gpu: {}, use_xpu: {}, use_visual: {}, train_batch_size: {}, train_data_dir: {}, epochs: {}, print_interval: {}, model_save_path: {}".
79+
format(use_gpu, use_xpu, use_visual, train_batch_size, train_data_dir,
80+
epochs, print_interval, model_save_path))
8081
logger.info("**************common.configs**********")
8182

82-
place = paddle.set_device('gpu' if use_gpu else 'cpu')
83+
if use_xpu:
84+
xpu_device = 'xpu:{0}'.format(os.getenv('FLAGS_selected_xpus', 0))
85+
place = paddle.set_device(xpu_device)
86+
else:
87+
place = paddle.set_device('gpu' if use_gpu else 'cpu')
8388

8489
dy_model = dy_model_class.create_model(config)
8590

0 commit comments

Comments
 (0)