21
21
import six
22
22
import signal
23
23
import subprocess
24
- import six
24
+ import argparse
25
25
26
26
27
27
class TestDistRunnerBase (object ):
@@ -43,40 +43,35 @@ def get_transpiler(self, trainer_id, main_program, pserver_endpoints,
43
43
sync_mode = sync_mode )
44
44
return t
45
45
46
- def run_pserver (self ,
47
- pserver_endpoints ,
48
- trainers ,
49
- current_endpoint ,
50
- trainer_id ,
51
- sync_mode = True ):
46
+ def run_pserver (self , args ):
52
47
import paddle
53
48
import paddle .fluid as fluid
54
49
self .get_model (batch_size = 2 )
55
- t = self .get_transpiler (trainer_id ,
56
- fluid .default_main_program (), pserver_endpoints ,
57
- trainers , sync_mode )
58
- pserver_prog = t .get_pserver_program (current_endpoint )
59
- startup_prog = t .get_startup_program (current_endpoint , pserver_prog )
50
+ if args .mem_opt :
51
+ fluid .memory_optimize (fluid .default_main_program ())
52
+ t = self .get_transpiler (args .trainer_id ,
53
+ fluid .default_main_program (), args .endpoints ,
54
+ args .trainers , args .sync_mode )
55
+ pserver_prog = t .get_pserver_program (args .current_endpoint )
56
+ startup_prog = t .get_startup_program (args .current_endpoint ,
57
+ pserver_prog )
60
58
place = fluid .CPUPlace ()
61
59
exe = fluid .Executor (place )
62
60
exe .run (startup_prog )
63
61
exe .run (pserver_prog )
64
62
65
- def run_trainer (self ,
66
- place ,
67
- endpoints ,
68
- trainer_id ,
69
- trainers ,
70
- is_dist = True ,
71
- sync_mode = True ):
63
+ def run_trainer (self , place , args ):
72
64
import paddle
73
65
import paddle .fluid as fluid
74
66
test_program , avg_cost , train_reader , test_reader , batch_acc , predict = \
75
- self .get_model (batch_size = 2 )
76
- if is_dist :
77
- t = self .get_transpiler (trainer_id ,
78
- fluid .default_main_program (), endpoints ,
79
- trainers , sync_mode )
67
+ self .get_model (batch_size = 2 )
68
+ if args .mem_opt :
69
+ fluid .memory_optimize (fluid .default_main_program ())
70
+ if args .is_dist :
71
+ t = self .get_transpiler (args .trainer_id ,
72
+ fluid .default_main_program (),
73
+ args .endpoints , args .trainers ,
74
+ args .sync_mode )
80
75
trainer_prog = t .get_trainer_program ()
81
76
else :
82
77
trainer_prog = fluid .default_main_program ()
@@ -117,27 +112,27 @@ def runtime_main(test_class):
117
112
import paddle .fluid as fluid
118
113
import paddle .fluid .core as core
119
114
120
- if len (sys .argv ) != 8 :
121
- print (
122
- "Usage: python dist_se_resnext.py [pserver/trainer] [endpoints] [trainer_id] [current_endpoint] [trainers] [is_dist] [sync_mode]"
123
- )
124
- role = sys .argv [1 ]
125
- endpoints = sys .argv [2 ]
126
- trainer_id = int (sys .argv [3 ])
127
- current_endpoint = sys .argv [4 ]
128
- trainers = int (sys .argv [5 ])
129
- is_dist = True if sys .argv [6 ] == "TRUE" else False
130
- sync_mode = True if sys .argv [7 ] == "TRUE" else False
115
+ parser = argparse .ArgumentParser (description = 'Run dist test.' )
116
+ parser .add_argument (
117
+ '--role' , type = str , required = True , choices = ['pserver' , 'trainer' ])
118
+ parser .add_argument ('--endpoints' , type = str , required = False , default = "" )
119
+ parser .add_argument ('--is_dist' , action = 'store_true' )
120
+ parser .add_argument ('--trainer_id' , type = int , required = False , default = 0 )
121
+ parser .add_argument ('--trainers' , type = int , required = False , default = 1 )
122
+ parser .add_argument (
123
+ '--current_endpoint' , type = str , required = False , default = "" )
124
+ parser .add_argument ('--sync_mode' , action = 'store_true' )
125
+ parser .add_argument ('--mem_opt' , action = 'store_true' )
126
+
127
+ args = parser .parse_args ()
131
128
132
129
model = test_class ()
133
- if role == "pserver" :
134
- model .run_pserver (endpoints , trainers , current_endpoint , trainer_id ,
135
- sync_mode )
130
+ if args .role == "pserver" and args .is_dist :
131
+ model .run_pserver (args )
136
132
else :
137
133
p = fluid .CUDAPlace (0 ) if core .is_compiled_with_cuda (
138
134
) else fluid .CPUPlace ()
139
- model .run_trainer (p , endpoints , trainer_id , trainers , is_dist ,
140
- sync_mode )
135
+ model .run_trainer (p , args )
141
136
142
137
143
138
import paddle .compat as cpt
@@ -153,30 +148,34 @@ def setUp(self):
153
148
self ._ps_endpoints = "127.0.0.1:9123,127.0.0.1:9124"
154
149
self ._python_interp = "python"
155
150
self ._sync_mode = True
151
+ self ._mem_opt = False
156
152
self ._setup_config ()
157
153
158
154
def start_pserver (self , model_file , check_error_log ):
159
- sync_mode_str = "TRUE" if self . _sync_mode else "FALSE"
155
+
160
156
ps0_ep , ps1_ep = self ._ps_endpoints .split ("," )
161
- ps0_cmd = "%s %s pserver %s 0 %s %d TRUE %s" % \
157
+ ps_cmd = "%s %s --role pserver --endpoints %s --trainer_id 0 --current_endpoint %s --trainers %d --is_dist %s %s"
158
+ sync_mode_str = "--sync_mode" if self ._sync_mode else ""
159
+ mem_opt_str = "--mem_opt" if self ._mem_opt else ""
160
+ ps0_cmd = ps_cmd % \
162
161
(self ._python_interp , model_file , self ._ps_endpoints , ps0_ep ,
163
- self ._trainers , sync_mode_str )
164
- ps1_cmd = "%s %s pserver %s 0 %s %d TRUE %s" % \
162
+ self ._trainers , sync_mode_str , mem_opt_str )
163
+ ps1_cmd = ps_cmd % \
165
164
(self ._python_interp , model_file , self ._ps_endpoints , ps1_ep ,
166
- self ._trainers , sync_mode_str )
165
+ self ._trainers , sync_mode_str , mem_opt_str )
167
166
168
167
ps0_pipe = subprocess .PIPE
169
168
ps1_pipe = subprocess .PIPE
170
169
if check_error_log :
171
- print ("ps0_cmd:" , ps0_cmd )
172
- print ("ps1_cmd:" , ps1_cmd )
170
+ print (ps0_cmd )
171
+ print (ps1_cmd )
173
172
ps0_pipe = open ("/tmp/ps0_err.log" , "wb" )
174
173
ps1_pipe = open ("/tmp/ps1_err.log" , "wb" )
175
174
176
175
ps0_proc = subprocess .Popen (
177
- ps0_cmd .split (" " ), stdout = subprocess .PIPE , stderr = ps0_pipe )
176
+ ps0_cmd .strip (). split (" " ), stdout = subprocess .PIPE , stderr = ps0_pipe )
178
177
ps1_proc = subprocess .Popen (
179
- ps1_cmd .split (" " ), stdout = subprocess .PIPE , stderr = ps1_pipe )
178
+ ps1_cmd .strip (). split (" " ), stdout = subprocess .PIPE , stderr = ps1_pipe )
180
179
181
180
if not check_error_log :
182
181
return ps0_proc , ps1_proc , None , None
@@ -199,7 +198,7 @@ def _wait_ps_ready(self, pid):
199
198
retry_times -= 1
200
199
201
200
def check_with_place (self , model_file , delta = 1e-3 , check_error_log = False ):
202
- # *ATTENTION* THIS TEST NEEDS AT LEAST 2GPUS TO RUN
201
+ # TODO(typhoonzero): should auto adapt GPU count on the machine.
203
202
required_envs = {
204
203
"PATH" : os .getenv ("PATH" ),
205
204
"PYTHONPATH" : os .getenv ("PYTHONPATH" ),
@@ -215,18 +214,14 @@ def check_with_place(self, model_file, delta=1e-3, check_error_log=False):
215
214
# Run local to get a base line
216
215
env_local = {"CUDA_VISIBLE_DEVICES" : "0" }
217
216
env_local .update (required_envs )
218
- sync_mode_str = "TRUE" if self ._sync_mode else "FALSE"
219
- local_cmd = "%s %s trainer %s 0 %s %d FLASE %s" % \
220
- (self ._python_interp , model_file ,
221
- "127.0.0.1:1234" , "127.0.0.1:1234" , 1 , sync_mode_str )
217
+ local_cmd = "%s %s --role trainer" % (self ._python_interp , model_file )
222
218
if not check_error_log :
223
219
local_proc = subprocess .Popen (
224
220
local_cmd .split (" " ),
225
221
stdout = subprocess .PIPE ,
226
222
stderr = subprocess .PIPE ,
227
223
env = env_local )
228
224
else :
229
- print ("trainer cmd:" , local_cmd )
230
225
err_log = open ("/tmp/trainer.err.log" , "wb" )
231
226
local_proc = subprocess .Popen (
232
227
local_cmd .split (" " ),
@@ -247,12 +242,17 @@ def check_with_place(self, model_file, delta=1e-3, check_error_log=False):
247
242
self ._wait_ps_ready (ps1 .pid )
248
243
249
244
ps0_ep , ps1_ep = self ._ps_endpoints .split ("," )
250
- tr0_cmd = "%s %s trainer %s 0 %s %d TRUE %s" % \
251
- (self ._python_interp , model_file , self ._ps_endpoints , ps0_ep ,
252
- self ._trainers , sync_mode_str )
253
- tr1_cmd = "%s %s trainer %s 1 %s %d TRUE %s" % \
254
- (self ._python_interp , model_file , self ._ps_endpoints , ps1_ep ,
255
- self ._trainers , sync_mode_str )
245
+ tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --trainers %d --is_dist %s %s"
246
+ sync_mode_str = "--sync_mode" if self ._sync_mode else ""
247
+ mem_opt_str = "--mem_opt" if self ._mem_opt else ""
248
+ tr0_cmd = tr_cmd % \
249
+ (self ._python_interp , model_file , self ._ps_endpoints ,
250
+ 0 , ps0_ep ,
251
+ self ._trainers , sync_mode_str , mem_opt_str )
252
+ tr1_cmd = tr_cmd % \
253
+ (self ._python_interp , model_file , self ._ps_endpoints ,
254
+ 1 , ps1_ep ,
255
+ self ._trainers , sync_mode_str , mem_opt_str )
256
256
257
257
env0 = {"CUDA_VISIBLE_DEVICES" : "0" }
258
258
env1 = {"CUDA_VISIBLE_DEVICES" : "1" }
@@ -269,12 +269,12 @@ def check_with_place(self, model_file, delta=1e-3, check_error_log=False):
269
269
tr1_pipe = open ("/tmp/tr1_err.log" , "wb" )
270
270
271
271
tr0_proc = subprocess .Popen (
272
- tr0_cmd .split (" " ),
272
+ tr0_cmd .strip (). split (" " ),
273
273
stdout = subprocess .PIPE ,
274
274
stderr = tr0_pipe ,
275
275
env = env0 )
276
276
tr1_proc = subprocess .Popen (
277
- tr1_cmd .split (" " ),
277
+ tr1_cmd .strip (). split (" " ),
278
278
stdout = subprocess .PIPE ,
279
279
stderr = tr1_pipe ,
280
280
env = env1 )
@@ -303,6 +303,8 @@ def check_with_place(self, model_file, delta=1e-3, check_error_log=False):
303
303
# FIXME: use terminate() instead of sigkill.
304
304
os .kill (ps0 .pid , signal .SIGKILL )
305
305
os .kill (ps1 .pid , signal .SIGKILL )
306
+ ps0 .wait ()
307
+ ps1 .wait ()
306
308
FNULL .close ()
307
309
308
310
self .assertAlmostEqual (local_first_loss , dist_first_loss , delta = delta )
0 commit comments