Skip to content

Commit b35239d

Browse files
seiriosPlus authored and gongweibao committed
fix dist ut with place, test=develop (#13647)
1 parent d6747a9 commit b35239d

File tree

6 files changed

+43
-48
lines changed

6 files changed

+43
-48
lines changed

python/paddle/fluid/tests/unittests/dist_se_resnext.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -247,7 +247,7 @@ def get_model(self, batch_size=2):
247247

248248
# Reader
249249
train_reader = paddle.batch(
250-
paddle.dataset.flowers.train(), batch_size=batch_size)
250+
paddle.dataset.flowers.test(use_xmap=False), batch_size=batch_size)
251251
test_reader = paddle.batch(
252252
paddle.dataset.flowers.test(use_xmap=False), batch_size=batch_size)
253253

python/paddle/fluid/tests/unittests/test_dist_base.py

Lines changed: 30 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -164,18 +164,30 @@ class TestDistBase(unittest.TestCase):
164164
def _setup_config(self):
165165
raise NotImplementedError("tests should have _setup_config implemented")
166166

167+
def _after_setup_config(self):
168+
if self._enforce_place == "CPU":
169+
self.__use_cuda = False
170+
elif self._enforce_place == "GPU":
171+
self.__use_cuda = True
172+
else:
173+
if fluid.core.is_compiled_with_cuda():
174+
self.__use_cuda = True
175+
else:
176+
self.__use_cuda = False
177+
167178
def setUp(self):
168179
self._trainers = 2
169180
self._pservers = 2
170181
self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
171182
self._find_free_port(), self._find_free_port())
172183
self._python_interp = "python"
173184
self._sync_mode = True
174-
self._use_cuda = True
185+
self._enforce_place = None
175186
self._mem_opt = False
176187
self._use_reduce = False
177188
self._use_reader_alloc = True
178189
self._setup_config()
190+
self._after_setup_config()
179191

180192
def _find_free_port(self):
181193
with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
@@ -199,13 +211,10 @@ def start_pserver(self, model_file, check_error_log, required_envs):
199211
ps0_cmd += " --mem_opt"
200212
ps1_cmd += " --mem_opt"
201213

202-
ps0_pipe = subprocess.PIPE
203-
ps1_pipe = subprocess.PIPE
204-
if check_error_log:
205-
print(ps0_cmd)
206-
print(ps1_cmd)
207-
ps0_pipe = open("/tmp/ps0_err.log", "wb")
208-
ps1_pipe = open("/tmp/ps1_err.log", "wb")
214+
print(ps0_cmd)
215+
print(ps1_cmd)
216+
ps0_pipe = open("/tmp/ps0_err.log", "wb")
217+
ps1_pipe = open("/tmp/ps1_err.log", "wb")
209218

210219
ps0_proc = subprocess.Popen(
211220
ps0_cmd.strip().split(" "),
@@ -218,10 +227,7 @@ def start_pserver(self, model_file, check_error_log, required_envs):
218227
stderr=ps1_pipe,
219228
env=required_envs)
220229

221-
if not check_error_log:
222-
return ps0_proc, ps1_proc, None, None
223-
else:
224-
return ps0_proc, ps1_proc, ps0_pipe, ps1_pipe
230+
return ps0_proc, ps1_proc, ps0_pipe, ps1_pipe
225231

226232
def _wait_ps_ready(self, pid):
227233
retry_times = 50
@@ -242,15 +248,15 @@ def _run_local(self, model, envs, check_error_log):
242248

243249
cmd = "%s %s --role trainer" % (self._python_interp, model)
244250

245-
if self._use_cuda:
251+
if self.__use_cuda:
246252
cmd += " --use_cuda"
247253
env_local = {"CUDA_VISIBLE_DEVICES": "0"}
248254
else:
249255
env_local = {'CPU_NUM': '1'}
250256

251257
envs.update(env_local)
252258

253-
if not check_error_log:
259+
if check_error_log:
254260
err_log = open("/tmp/trainer.err.log", "wb")
255261
local_proc = subprocess.Popen(
256262
cmd.split(" "),
@@ -264,7 +270,6 @@ def _run_local(self, model, envs, check_error_log):
264270
stderr=subprocess.PIPE,
265271
env=envs)
266272

267-
local_proc.wait()
268273
local_out, local_err = local_proc.communicate()
269274
local_ret = cpt.to_text(local_out)
270275

@@ -305,7 +310,7 @@ def _run_cluster(self, model, envs, check_error_log):
305310
if self._use_reader_alloc:
306311
tr0_cmd += " --use_reader_alloc"
307312
tr1_cmd += " --use_reader_alloc"
308-
if self._use_cuda:
313+
if self.__use_cuda:
309314
tr0_cmd += " --use_cuda"
310315
tr1_cmd += " --use_cuda"
311316
env0 = {"CUDA_VISIBLE_DEVICES": "0"}
@@ -317,15 +322,10 @@ def _run_cluster(self, model, envs, check_error_log):
317322
env0.update(envs)
318323
env1.update(envs)
319324

320-
FNULL = open(os.devnull, 'w')
321-
322-
tr0_pipe = subprocess.PIPE
323-
tr1_pipe = subprocess.PIPE
324-
if check_error_log:
325-
print("tr0_cmd:{}, env0: {}".format(tr0_cmd, env0))
326-
print("tr1_cmd:{}, env1: {}".format(tr1_cmd, env1))
327-
tr0_pipe = open("/tmp/tr0_err.log", "wb")
328-
tr1_pipe = open("/tmp/tr1_err.log", "wb")
325+
print("tr0_cmd:{}, env0: {}".format(tr0_cmd, env0))
326+
print("tr1_cmd:{}, env1: {}".format(tr1_cmd, env1))
327+
tr0_pipe = open("/tmp/tr0_err.log", "wb")
328+
tr1_pipe = open("/tmp/tr1_err.log", "wb")
329329

330330
tr0_proc = subprocess.Popen(
331331
tr0_cmd.strip().split(" "),
@@ -338,29 +338,22 @@ def _run_cluster(self, model, envs, check_error_log):
338338
stderr=tr1_pipe,
339339
env=env1)
340340

341-
tr0_proc.wait()
342-
tr1_proc.wait()
343-
344341
tr0_out, tr0_err = tr0_proc.communicate()
345342
tr0_loss_text = cpt.to_text(tr0_out)
346343
tr1_out, tr1_err = tr1_proc.communicate()
347344
tr1_loss_text = cpt.to_text(tr1_out)
348345

349346
# close trainer file
350-
if check_error_log:
351-
tr0_pipe.close()
352-
tr1_pipe.close()
347+
tr0_pipe.close()
348+
tr1_pipe.close()
353349

354-
ps0_pipe.close()
355-
ps1_pipe.close()
350+
ps0_pipe.close()
351+
ps1_pipe.close()
356352
# FIXME: use terminate() instead of sigkill.
357353
os.kill(ps0.pid, signal.SIGKILL)
358354
os.kill(ps1.pid, signal.SIGKILL)
359355
ps0.terminate()
360356
ps1.terminate()
361-
ps0.wait()
362-
ps1.wait()
363-
FNULL.close()
364357

365358
# print log
366359
sys.stderr.write('trainer 0 stdout:\n %s\n' % tr0_loss_text)
@@ -385,6 +378,7 @@ def check_with_place(self,
385378
"LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
386379
"FLAGS_fraction_of_gpu_memory_to_use": "0.15",
387380
"FLAGS_cudnn_deterministic": "1",
381+
"http_proxy": ""
388382
}
389383

390384
required_envs.update(need_envs)

python/paddle/fluid/tests/unittests/test_dist_ctr.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,11 @@
2121
class TestDistCTR2x2(TestDistBase):
2222
def _setup_config(self):
2323
self._sync_mode = True
24-
self._use_cuda = False
24+
self._enforce_place = "CPU"
2525

26-
def test_dist_ctr(self):
27-
self.check_with_place("dist_ctr.py", delta=1e-7, check_error_log=False)
26+
27+
def test_dist_ctr(self):
28+
self.check_with_place("dist_ctr.py", delta=1e-7, check_error_log=False)
2829

2930

3031
if __name__ == "__main__":

python/paddle/fluid/tests/unittests/test_dist_se_resnext.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ def _setup_config(self):
2222
self._sync_mode = True
2323
self._use_reader_alloc = False
2424

25-
def no_test_dist_train(self):
25+
def test_dist_train(self):
2626
self.check_with_place("dist_se_resnext.py", delta=100)
2727

2828

@@ -40,7 +40,7 @@ def _setup_config(self):
4040
self._sync_mode = False
4141
self._use_reader_alloc = False
4242

43-
def no_test_dist_train(self):
43+
def test_dist_train(self):
4444
self.check_with_place("dist_se_resnext.py", delta=100)
4545

4646

python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
class TestDistSimnetBowDense2x2(TestDistBase):
2323
def _setup_config(self):
2424
self._sync_mode = True
25-
self._use_cuda = False
25+
self._enforce_place = "CPU"
2626

2727
def test_simnet_bow(self):
2828
need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '0'}
@@ -36,7 +36,7 @@ def test_simnet_bow(self):
3636
class TestDistSimnetBow2x2DenseAsync(TestDistBase):
3737
def _setup_config(self):
3838
self._sync_mode = False
39-
self._use_cuda = False
39+
self._enforce_place = "CPU"
4040

4141
def test_simnet_bow(self):
4242
need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '0'}
@@ -50,7 +50,7 @@ def test_simnet_bow(self):
5050
class TestDistSimnetBowSparse2x2(TestDistBase):
5151
def _setup_config(self):
5252
self._sync_mode = True
53-
self._use_cuda = False
53+
self._enforce_place = "CPU"
5454

5555
def test_simnet_bow(self):
5656
need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '1'}
@@ -64,7 +64,7 @@ def test_simnet_bow(self):
6464
class TestDistSimnetBow2x2SparseAsync(TestDistBase):
6565
def _setup_config(self):
6666
self._sync_mode = False
67-
self._use_cuda = False
67+
self._enforce_place = "CPU"
6868

6969
def test_simnet_bow(self):
7070
need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '1'}

python/paddle/fluid/tests/unittests/test_dist_text_classification.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
class TestDistTextClassification2x2(TestDistBase):
2222
def _setup_config(self):
2323
self._sync_mode = True
24-
self._use_cuda = False
24+
self._enforce_place = "CPU"
2525

2626
def test_text_classification(self):
2727
self.check_with_place("dist_text_classification.py", delta=1e-6)
@@ -30,7 +30,7 @@ def test_text_classification(self):
3030
class TestDistTextClassification2x2Async(TestDistBase):
3131
def _setup_config(self):
3232
self._sync_mode = False
33-
self._use_cuda = False
33+
self._enforce_place = "CPU"
3434

3535
def test_se_resnext(self):
3636
self.check_with_place("dist_text_classification.py", delta=100)

0 commit comments

Comments (0)