
Commit de130e9

wangxicoding authored and gongweibao committed
[Cherry-pick 1.6] Print the rank of trainer & remove nccl sync in launch.py (#20937)
1 parent 3db61dc commit de130e9

1 file changed
python/paddle/distributed/launch.py

Lines changed: 16 additions & 14 deletions
@@ -51,7 +51,7 @@
     logger.setLevel(logging.INFO)
     log_handler = logging.StreamHandler()
     log_format = logging.Formatter(
-        '%(asctime)s - %(filename)s:%(lineno)d - %(levelname)s: %(message)s')
+        '%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s')
     log_handler.setFormatter(log_format)
     logger.addHandler(log_handler)
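For illustration only (not part of the commit): a minimal standalone sketch of the new glog-style formatter. The format string is taken verbatim from the diff; the logger name and message are invented for the demo.

import logging

logger = logging.getLogger("launch_demo")
logger.setLevel(logging.INFO)
handler = logging.StreamHandler()
# Format introduced by this commit: level first, then timestamp,
# file:line, and the message, with a glog-like "]" separator.
handler.setFormatter(
    logging.Formatter(
        '%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s'))
logger.addHandler(handler)

logger.info("trainer started")
# Prints something like:
# INFO 2019-11-05 10:20:30,123 demo.py:15] trainer started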

@@ -71,7 +71,7 @@ def _parse_args():
     parser = ArgumentParser(
         description='''start paddle training using multi-process mode.
 NOTE: your train program ***must*** run as distributed nccl2 mode,
-see: http://www.paddlepaddle.org/documentation/docs/zh/1.2/user_guides/howto/training/cluster_howto.html#permalink-8--nccl2-
+see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/training/cluster_howto.html#permalink-8--nccl2-
 And your train program must read environment variables below in order to let different
 process init properly:
 FLAGS_selected_gpus
@@ -147,9 +147,6 @@ def terminate_procs(procs):
 def start_procs(args):
     """
     """
-    procs = []
-    log_fns = []
-
     default_env = os.environ.copy()

     current_node_ip = args.node_ip
@@ -213,48 +210,49 @@ def start_procs(args):
     current_env.pop("https_proxy", None)

     procs = []
+    log_fns = []
     cmds = []
+    ranks = []
     for i in range(0, selected_gpus_num):
+        rank = (node_id * selected_gpus_num + i)
         current_env.update({
             "FLAGS_selected_gpus": "%s" % selected_gpus[i],
-            "PADDLE_TRAINER_ID": "%d" % (node_id * selected_gpus_num + i),
+            "PADDLE_TRAINER_ID": "%d" % rank,
             "PADDLE_CURRENT_ENDPOINT":
             "%s:%d" % (current_node_ip, args.started_port + i),
             "PADDLE_TRAINERS_NUM": "%d" % nranks,
             "PADDLE_TRAINER_ENDPOINTS": trainers_endpoints
         })

-        if num_nodes > 1:
-            current_env.update({"FLAGS_sync_nccl_allreduce": "0"})
-
         cmd = [sys.executable, "-u", args.training_script
                ] + args.training_script_args
-
         cmds.append(cmd)

         if args.log_dir is not None:
             os.system("mkdir -p {}".format(args.log_dir))
             fn = open("%s/workerlog.%d" % (args.log_dir, i), "w")
             log_fns.append(fn)
-
             proc = subprocess.Popen(cmd, env=current_env, stdout=fn, stderr=fn)
         else:
             proc = subprocess.Popen(cmd, env=current_env)

         procs.append(proc)
+        ranks.append(rank)

     try:
         alive = True
         error = False
+        error_rank = []
         # wait all process finish or one error
         while alive and not error:
             alive = False
-            for p in procs:
+            for rank, p in zip(ranks, procs):
                 ret = p.poll()
                 if ret is None:
                     alive = True
                 elif ret != 0:
                     error = True
+                    error_rank.append(rank)
             time.sleep(1)

         if error:
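To make the rank bookkeeping above concrete, here is a hedged, self-contained sketch of the same poll loop. The two child commands are hypothetical stand-ins for trainer processes (rank 1 deliberately exits non-zero to simulate a crash); the loop body mirrors the diff.

import subprocess
import sys
import time

# Hypothetical single-node setup with two "GPUs".
node_id, selected_gpus_num = 0, 2

procs = []
ranks = []
for i in range(0, selected_gpus_num):
    # Global rank, exactly as computed in the diff.
    rank = (node_id * selected_gpus_num + i)
    # Stand-in trainer: process i exits with code i.
    cmd = [sys.executable, "-c", "import sys; sys.exit(%d)" % i]
    procs.append(subprocess.Popen(cmd))
    ranks.append(rank)

alive = True
error = False
error_rank = []
# Wait until all processes finish or one fails, recording which
# ranks failed -- the behavior this commit adds.
while alive and not error:
    alive = False
    for rank, p in zip(ranks, procs):
        ret = p.poll()
        if ret is None:
            alive = True
        elif ret != 0:
            error = True
            error_rank.append(rank)
    time.sleep(1)

print("failed ranks:", error_rank)  # expected: [1]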
@@ -266,11 +264,15 @@ def start_procs(args):
         terminate_procs(procs)
         raise
     except SystemExit:
-        logger.error("One trainer process abort, exit")
+        logger.error(
+            "ABORT!!! Out of all {} trainers, the trainer process with rank={} was aborted. Please check its log.".
+            format(nranks, error_rank))
         terminate_procs(procs)
         raise
     except:
-        logger.error("Trainer process abort, exit")
+        logger.error(
+            "ABORT!!! Out of all {} trainers, the trainer process with rank={} was aborted. Please check its log.".
+            format(nranks, error_rank))
         terminate_procs(procs)
         raise
     finally:
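For context, the new abort message interpolates both the trainer count and the list of failing ranks collected in the poll loop. A trivially runnable sketch (the nranks and error_rank values are hypothetical; the names and message text come from the diff):

# Values are hypothetical; names and message come from the diff.
nranks = 8
error_rank = [3, 5]
print(
    "ABORT!!! Out of all {} trainers, the trainer process with rank={} was aborted. Please check its log.".
    format(nranks, error_rank))
# -> ABORT!!! Out of all 8 trainers, the trainer process with
#    rank=[3, 5] was aborted. Please check its log.

Since error_rank is a list, the message reports every failed rank in one line, rather than the old generic "Trainer process abort, exit".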
