Skip to content

Commit e0bb8cc

Browse files
committed
fix conflict
2 parents f79a3a8 + e61d724 commit e0bb8cc

File tree

3 files changed

+196
-0
lines changed

3 files changed

+196
-0
lines changed

python/paddle/distributed/__init__.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.

python/paddle/distributed/launch.py

Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
from __future__ import print_function
16+
17+
import subprocess
18+
import os
19+
import sys
20+
import time
21+
import argparse
22+
23+
# Baseline environment handed to every spawned trainer process.  start_procs
# augments it with any FLAGS_* / NCCL_* / GLOG_* variables found in the
# parent environment before launching the children.
default_envs = {
    # Default endpoints for a single-node, 8-GPU run; start_procs rebuilds
    # the full endpoint list when PADDLE_TRAINERS names multiple nodes.
    "PADDLE_TRAINER_ENDPOINTS":
    "127.0.0.1:6170,127.0.0.1:6171,127.0.0.1:6172,127.0.0.1:6173,127.0.0.1:6174,127.0.0.1:6175,127.0.0.1:6176,127.0.0.1:6177",
    # Propagate dynamic-linker settings from the parent environment.
    "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
    # NOTE(review): unlike the entries above, this has no "" fallback, so
    # the value is None when PATH is unset — confirm that is intended.
    "PATH": os.getenv("PATH"),
    "LD_PRELOAD": os.getenv("LD_PRELOAD", ""),
    "PADDLE_TRAINERS_NUM": "8",
    # NCCL / glog diagnostics and transport settings for multi-GPU runs.
    "NCCL_DEBUG": "INFO",
    "GLOG_v": "0",
    "NCCL_SOCKET_IFNAME": "eth0",
    "NCCL_IB_GID_INDEX": "3",
    "NCCL_IB_RETRY_CNT": "0",
}

# Number of GPUs per node.  NOTE(review): not referenced anywhere in this
# file (parse_args carries its own default of 8) — possibly dead, but kept
# since importers may rely on it.
GPUS = 8
38+
39+
40+
def start_procs(gpus, entrypoint, entrypoint_args, log_dir):
    """Spawn one training process per GPU on this node and wait for all.

    Args:
        gpus (int): number of worker processes (one per GPU) to start.
        entrypoint (str): path of the training script to execute.
        entrypoint_args (list[str]): extra command-line args for the script.
        log_dir (str): directory receiving one ``workerlog.<i>`` per rank.
    """
    procs = []
    log_fns = []
    # makedirs replaces shelling out via `os.system("mkdir -p ...")`.
    os.makedirs(log_dir, exist_ok=True)

    # ======== update parent envs =======
    # Work on a copy so the module-level default_envs is never mutated.
    envs = dict(default_envs)
    for k, v in os.environ.items():
        if k.startswith(("FLAGS_", "NCCL_", "GLOG_")):
            envs[k] = v

    # ======== for dist training =======
    node_trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
    current_ip = os.getenv("POD_IP", "127.0.0.1")
    trainer_ips = os.getenv("PADDLE_TRAINERS", current_ip).split(",")
    num_nodes = len(trainer_ips)
    # Endpoint "<ip>:617<gpu>" for every GPU on every node, comma-joined.
    all_nodes_devices_endpoints = ",".join(
        "%s:617%d" % (n, i) for n in trainer_ips for i in range(gpus))
    nranks = num_nodes * gpus
    # ======== for dist training =======

    for i in range(gpus):
        curr_env = {}
        curr_env.update(envs)
        curr_env.update({
            "FLAGS_selected_gpus": "%d" % i,
            # Globally-unique rank of this worker across all nodes.
            "PADDLE_TRAINER_ID": "%d" % (node_trainer_id * gpus + i),
            "PADDLE_CURRENT_ENDPOINT": "%s:617%d" % (current_ip, i),
            # nranks
            "PADDLE_TRAINERS_NUM": "%d" % nranks,
            "PADDLE_TRAINER_ENDPOINTS": all_nodes_devices_endpoints
        })

        print("starting process ", i, entrypoint, entrypoint_args, curr_env)
        fn = open("%s/workerlog.%d" % (log_dir, i), "w")
        log_fns.append(fn)
        cmd = [sys.executable, "-u", entrypoint] + entrypoint_args
        procs.append(subprocess.Popen(cmd, stdout=fn, stderr=fn, env=curr_env))

    # Wait on every child; the log file is closed even if the wait fails.
    # (The original used a bare `except: pass`, which hid all errors, and a
    # redundant terminate() on an already-exited process.)
    for i in range(gpus):
        try:
            procs[i].communicate()
        except OSError:
            pass
        finally:
            log_fns[i].close()
89+
90+
91+
def parse_args():
    """Define and evaluate the launcher's command-line interface.

    Returns:
        argparse.Namespace with attributes ``gpus``, ``log_dir``,
        ``entrypoint_script`` and ``entrypoint_args``.
    """
    description = '''start paddle training using multi-process mode.
NOTE: your train program ***must*** run as distributed nccl2 mode,
see: http://www.paddlepaddle.org/documentation/docs/zh/1.2/user_guides/howto/training/cluster_howto.html#permalink-8--nccl2-
And your train program must read environment variables below in order to let different
process init properly:
FLAGS_selected_gpus
PADDLE_TRAINER_ID
PADDLE_CURRENT_ENDPOINT
PADDLE_TRAINERS_NUM
PADDLE_TRAINER_ENDPOINTS
POD_IP (current node ip address, not needed for local training)
'''
    parser = argparse.ArgumentParser(description=description)
    # How many worker processes (one per GPU) to launch on this node.
    parser.add_argument(
        '--gpus',
        type=int,
        default=8,
        help='start number of processes for every gpu')
    # Where each worker's stdout/stderr log file is written.
    parser.add_argument(
        '--log_dir',
        type=str,
        default="mylog",
        help='directory to put logs per process.')
    # The script to run, then everything after it is forwarded verbatim.
    parser.add_argument(
        'entrypoint_script',
        type=str,
        help="The entrypoint script to be launched in parallel,"
        "followed by all the arguments for each process,"
        "e.g. train.py --lr 0.1")
    parser.add_argument('entrypoint_args', nargs=argparse.REMAINDER)
    return parser.parse_args()
124+
125+
126+
def main():
    """Entry point: parse the CLI, then fan out one worker per GPU."""
    options = parse_args()
    # Launch the training processes and block until they all finish.
    start_procs(options.gpus, options.entrypoint_script,
                options.entrypoint_args, options.log_dir)


if __name__ == "__main__":
    main()
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import os
import unittest
import paddle.fluid as fluid
import paddle.fluid.core as core

# A 0.0 GB threshold means eligible tensors are freed immediately.
# NOTE(review): these env vars are set *after* paddle.fluid is imported —
# confirm the flags are still honored at this point.
os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0"
# Pre-converted WMT16 recordio data consumed by the transformer test.
os.environ[
    'RECORDIO_FILENAME'] = '/tmp/ir_memory_optimize_transformer.wmt16.recordio'

# Reuse the base test case and network builder from the non-IR
# parallel-executor transformer test.
from test_parallel_executor_transformer import TestTransformer
from test_parallel_executor_transformer import transformer
26+
27+
28+
# NOTE(dzhwinter): test different strategy collisions.
# open the eager delete tensor strategy by default.
class TestTransformerWithIR(TestTransformer):
    # Runs the transformer convergence check twice — once with the Python
    # (transpiler-based) memory optimization, once with the IR pass — to
    # ensure the two strategies do not conflict.
    def test_main(self):
        # CUDA-only: on CPU builds the body is skipped entirely.
        if core.is_compiled_with_cuda():
            # check python transpiler
            self.check_network_convergence(
                transformer,
                use_cuda=True,
                memory_opt=True,
                use_ir_memory_optimize=False)
            # check IR memory optimize
            self.check_network_convergence(
                transformer,
                use_cuda=True,
                memory_opt=False,
                use_ir_memory_optimize=True)
45+
46+
47+
# Run this file's test cases when executed directly.
if __name__ == '__main__':
    unittest.main()

0 commit comments

Comments
 (0)