Skip to content

Commit da4129f

Browse files
committed
Fix distributed unit tests (dist ut) with place, test=develop (#13647)
1 parent e9adfc4 commit da4129f

File tree

5 files changed: 41 additions and 46 deletions

5 files changed

+41
-46
lines changed

python/paddle/fluid/tests/unittests/dist_se_resnext.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -247,7 +247,7 @@ def get_model(self, batch_size=2):
247247

248248
# Reader
249249
train_reader = paddle.batch(
250-
paddle.dataset.flowers.train(), batch_size=batch_size)
250+
paddle.dataset.flowers.test(use_xmap=False), batch_size=batch_size)
251251
test_reader = paddle.batch(
252252
paddle.dataset.flowers.test(use_xmap=False), batch_size=batch_size)
253253

python/paddle/fluid/tests/unittests/test_dist_base.py

Lines changed: 30 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -166,18 +166,30 @@ class TestDistBase(unittest.TestCase):
166166
def _setup_config(self):
167167
raise NotImplementedError("tests should have _setup_config implemented")
168168

169+
def _after_setup_config(self):
170+
if self._enforce_place == "CPU":
171+
self.__use_cuda = False
172+
elif self._enforce_place == "GPU":
173+
self.__use_cuda = True
174+
else:
175+
if fluid.core.is_compiled_with_cuda():
176+
self.__use_cuda = True
177+
else:
178+
self.__use_cuda = False
179+
169180
def setUp(self):
170181
self._trainers = 2
171182
self._pservers = 2
172183
self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
173184
self._find_free_port(), self._find_free_port())
174185
self._python_interp = "python"
175186
self._sync_mode = True
176-
self._use_cuda = True
187+
self._enforce_place = None
177188
self._mem_opt = False
178189
self._use_reduce = False
179190
self._use_reader_alloc = True
180191
self._setup_config()
192+
self._after_setup_config()
181193

182194
def _find_free_port(self):
183195
with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
@@ -201,13 +213,10 @@ def start_pserver(self, model_file, check_error_log, required_envs):
201213
ps0_cmd += " --mem_opt"
202214
ps1_cmd += " --mem_opt"
203215

204-
ps0_pipe = subprocess.PIPE
205-
ps1_pipe = subprocess.PIPE
206-
if check_error_log:
207-
print(ps0_cmd)
208-
print(ps1_cmd)
209-
ps0_pipe = open("/tmp/ps0_err.log", "wb")
210-
ps1_pipe = open("/tmp/ps1_err.log", "wb")
216+
print(ps0_cmd)
217+
print(ps1_cmd)
218+
ps0_pipe = open("/tmp/ps0_err.log", "wb")
219+
ps1_pipe = open("/tmp/ps1_err.log", "wb")
211220

212221
ps0_proc = subprocess.Popen(
213222
ps0_cmd.strip().split(" "),
@@ -220,10 +229,7 @@ def start_pserver(self, model_file, check_error_log, required_envs):
220229
stderr=ps1_pipe,
221230
env=required_envs)
222231

223-
if not check_error_log:
224-
return ps0_proc, ps1_proc, None, None
225-
else:
226-
return ps0_proc, ps1_proc, ps0_pipe, ps1_pipe
232+
return ps0_proc, ps1_proc, ps0_pipe, ps1_pipe
227233

228234
def _wait_ps_ready(self, pid):
229235
retry_times = 50
@@ -244,15 +250,15 @@ def _run_local(self, model, envs, check_error_log):
244250

245251
cmd = "%s %s --role trainer" % (self._python_interp, model)
246252

247-
if self._use_cuda:
253+
if self.__use_cuda:
248254
cmd += " --use_cuda"
249255
env_local = {"CUDA_VISIBLE_DEVICES": "0"}
250256
else:
251257
env_local = {'CPU_NUM': '1'}
252258

253259
envs.update(env_local)
254260

255-
if not check_error_log:
261+
if check_error_log:
256262
err_log = open("/tmp/trainer.err.log", "wb")
257263
local_proc = subprocess.Popen(
258264
cmd.split(" "),
@@ -266,7 +272,6 @@ def _run_local(self, model, envs, check_error_log):
266272
stderr=subprocess.PIPE,
267273
env=envs)
268274

269-
local_proc.wait()
270275
local_out, local_err = local_proc.communicate()
271276
local_ret = cpt.to_text(local_out)
272277

@@ -307,7 +312,7 @@ def _run_cluster(self, model, envs, check_error_log):
307312
if self._use_reader_alloc:
308313
tr0_cmd += " --use_reader_alloc"
309314
tr1_cmd += " --use_reader_alloc"
310-
if self._use_cuda:
315+
if self.__use_cuda:
311316
tr0_cmd += " --use_cuda"
312317
tr1_cmd += " --use_cuda"
313318
env0 = {"CUDA_VISIBLE_DEVICES": "0"}
@@ -319,15 +324,10 @@ def _run_cluster(self, model, envs, check_error_log):
319324
env0.update(envs)
320325
env1.update(envs)
321326

322-
FNULL = open(os.devnull, 'w')
323-
324-
tr0_pipe = subprocess.PIPE
325-
tr1_pipe = subprocess.PIPE
326-
if check_error_log:
327-
print("tr0_cmd:{}, env0: {}".format(tr0_cmd, env0))
328-
print("tr1_cmd:{}, env1: {}".format(tr1_cmd, env1))
329-
tr0_pipe = open("/tmp/tr0_err.log", "wb")
330-
tr1_pipe = open("/tmp/tr1_err.log", "wb")
327+
print("tr0_cmd:{}, env0: {}".format(tr0_cmd, env0))
328+
print("tr1_cmd:{}, env1: {}".format(tr1_cmd, env1))
329+
tr0_pipe = open("/tmp/tr0_err.log", "wb")
330+
tr1_pipe = open("/tmp/tr1_err.log", "wb")
331331

332332
tr0_proc = subprocess.Popen(
333333
tr0_cmd.strip().split(" "),
@@ -340,29 +340,22 @@ def _run_cluster(self, model, envs, check_error_log):
340340
stderr=tr1_pipe,
341341
env=env1)
342342

343-
tr0_proc.wait()
344-
tr1_proc.wait()
345-
346343
tr0_out, tr0_err = tr0_proc.communicate()
347344
tr0_loss_text = cpt.to_text(tr0_out)
348345
tr1_out, tr1_err = tr1_proc.communicate()
349346
tr1_loss_text = cpt.to_text(tr1_out)
350347

351348
# close trainer file
352-
if check_error_log:
353-
tr0_pipe.close()
354-
tr1_pipe.close()
349+
tr0_pipe.close()
350+
tr1_pipe.close()
355351

356-
ps0_pipe.close()
357-
ps1_pipe.close()
352+
ps0_pipe.close()
353+
ps1_pipe.close()
358354
# FIXME: use terminate() instead of sigkill.
359355
os.kill(ps0.pid, signal.SIGKILL)
360356
os.kill(ps1.pid, signal.SIGKILL)
361357
ps0.terminate()
362358
ps1.terminate()
363-
ps0.wait()
364-
ps1.wait()
365-
FNULL.close()
366359

367360
# print log
368361
sys.stderr.write('trainer 0 stdout:\n %s\n' % tr0_loss_text)
@@ -387,6 +380,7 @@ def check_with_place(self,
387380
"LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
388381
"FLAGS_fraction_of_gpu_memory_to_use": "0.15",
389382
"FLAGS_cudnn_deterministic": "1",
383+
"http_proxy": ""
390384
}
391385

392386
required_envs.update(need_envs)

python/paddle/fluid/tests/unittests/test_dist_ctr.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,11 @@
2121
class TestDistCTR2x2(TestDistBase):
2222
def _setup_config(self):
2323
self._sync_mode = True
24-
self._use_cuda = False
24+
self._enforce_place = "CPU"
2525

26-
def test_dist_ctr(self):
27-
self.check_with_place("dist_ctr.py", delta=1e-7, check_error_log=False)
26+
27+
def test_dist_ctr(self):
28+
self.check_with_place("dist_ctr.py", delta=1e-7, check_error_log=False)
2829

2930

3031
if __name__ == "__main__":

python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
class TestDistSimnetBowDense2x2(TestDistBase):
2323
def _setup_config(self):
2424
self._sync_mode = True
25-
self._use_cuda = False
25+
self._enforce_place = "CPU"
2626

2727
def test_simnet_bow(self):
2828
need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '0'}
@@ -36,7 +36,7 @@ def test_simnet_bow(self):
3636
class TestDistSimnetBow2x2DenseAsync(TestDistBase):
3737
def _setup_config(self):
3838
self._sync_mode = False
39-
self._use_cuda = False
39+
self._enforce_place = "CPU"
4040

4141
def test_simnet_bow(self):
4242
need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '0'}
@@ -50,7 +50,7 @@ def test_simnet_bow(self):
5050
class TestDistSimnetBowSparse2x2(TestDistBase):
5151
def _setup_config(self):
5252
self._sync_mode = True
53-
self._use_cuda = False
53+
self._enforce_place = "CPU"
5454

5555
def test_simnet_bow(self):
5656
need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '1'}
@@ -64,7 +64,7 @@ def test_simnet_bow(self):
6464
class TestDistSimnetBow2x2SparseAsync(TestDistBase):
6565
def _setup_config(self):
6666
self._sync_mode = False
67-
self._use_cuda = False
67+
self._enforce_place = "CPU"
6868

6969
def test_simnet_bow(self):
7070
need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '1'}

python/paddle/fluid/tests/unittests/test_dist_text_classification.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
class TestDistTextClassification2x2(TestDistBase):
2222
def _setup_config(self):
2323
self._sync_mode = True
24-
self._use_cuda = False
24+
self._enforce_place = "CPU"
2525

2626
def test_text_classification(self):
2727
self.check_with_place("dist_text_classification.py", delta=1e-6)
@@ -30,7 +30,7 @@ def test_text_classification(self):
3030
class TestDistTextClassification2x2Async(TestDistBase):
3131
def _setup_config(self):
3232
self._sync_mode = False
33-
self._use_cuda = False
33+
self._enforce_place = "CPU"
3434

3535
def test_se_resnext(self):
3636
self.check_with_place("dist_text_classification.py", delta=100)

0 commit comments

Comments
 (0)