@@ -164,18 +164,30 @@ class TestDistBase(unittest.TestCase):
     def _setup_config(self):
         raise NotImplementedError("tests should have _setup_config implemented")
 
+    def _after_setup_config(self):
+        if self._enforce_place == "CPU":
+            self.__use_cuda = False
+        elif self._enforce_place == "GPU":
+            self.__use_cuda = True
+        else:
+            if fluid.core.is_compiled_with_cuda():
+                self.__use_cuda = True
+            else:
+                self.__use_cuda = False
+
     def setUp(self):
         self._trainers = 2
         self._pservers = 2
         self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
             self._find_free_port(), self._find_free_port())
         self._python_interp = "python"
         self._sync_mode = True
-        self._use_cuda = True
+        self._enforce_place = None
         self._mem_opt = False
         self._use_reduce = False
         self._use_reader_alloc = True
         self._setup_config()
+        self._after_setup_config()
 
     def _find_free_port(self):
         with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
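The new _enforce_place knob replaces the old _use_cuda flag: a subclass pins the device in _setup_config, and _after_setup_config translates that into the private __use_cuda attribute (name-mangled to _TestDistBase__use_cuda, so only the base class reads it). A minimal sketch of how a concrete test would use it; the subclass name, model script, and delta below are hypothetical, not taken from this patch:

    class TestDistMnistCPU(TestDistBase):       # hypothetical subclass
        def _setup_config(self):
            self._sync_mode = True
            self._enforce_place = "CPU"         # forces __use_cuda = False regardless of the build

        def test_dist_train(self):
            # model script name and tolerance are illustrative only
            self.check_with_place("dist_mnist.py", delta=1e-7)
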
@@ -199,13 +211,10 @@ def start_pserver(self, model_file, check_error_log, required_envs):
             ps0_cmd += " --mem_opt"
             ps1_cmd += " --mem_opt"
 
-        ps0_pipe = subprocess.PIPE
-        ps1_pipe = subprocess.PIPE
-        if check_error_log:
-            print(ps0_cmd)
-            print(ps1_cmd)
-            ps0_pipe = open("/tmp/ps0_err.log", "wb")
-            ps1_pipe = open("/tmp/ps1_err.log", "wb")
+        print(ps0_cmd)
+        print(ps1_cmd)
+        ps0_pipe = open("/tmp/ps0_err.log", "wb")
+        ps1_pipe = open("/tmp/ps1_err.log", "wb")
 
         ps0_proc = subprocess.Popen(
             ps0_cmd.strip().split(" "),
@@ -218,10 +227,7 @@ def start_pserver(self, model_file, check_error_log, required_envs):
             stderr=ps1_pipe,
             env=required_envs)
 
-        if not check_error_log:
-            return ps0_proc, ps1_proc, None, None
-        else:
-            return ps0_proc, ps1_proc, ps0_pipe, ps1_pipe
+        return ps0_proc, ps1_proc, ps0_pipe, ps1_pipe
 
     def _wait_ps_ready(self, pid):
         retry_times = 50
@@ -242,15 +248,15 @@ def _run_local(self, model, envs, check_error_log):
 
         cmd = "%s %s --role trainer" % (self._python_interp, model)
 
-        if self._use_cuda:
+        if self.__use_cuda:
             cmd += " --use_cuda"
             env_local = {"CUDA_VISIBLE_DEVICES": "0"}
         else:
             env_local = {'CPU_NUM': '1'}
 
         envs.update(env_local)
 
-        if not check_error_log:
+        if check_error_log:
             err_log = open("/tmp/trainer.err.log", "wb")
             local_proc = subprocess.Popen(
                 cmd.split(" "),
@@ -264,7 +270,6 @@ def _run_local(self, model, envs, check_error_log):
                 stderr=subprocess.PIPE,
                 env=envs)
 
-        local_proc.wait()
         local_out, local_err = local_proc.communicate()
         local_ret = cpt.to_text(local_out)
 
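Dropping local_proc.wait() (and the analogous tr0_proc.wait()/tr1_proc.wait() calls further down) fixes a classic subprocess pitfall rather than a cosmetic issue: with stdout=subprocess.PIPE, wait() can deadlock once the child fills the OS pipe buffer, because the parent is not reading while it waits. communicate() drains the pipes and reaps the process in one call. A standalone sketch, independent of the patch:

    import subprocess

    # Child that writes well past a typical pipe buffer (~64 KiB on Linux).
    proc = subprocess.Popen(
        ["python", "-c", "print('x' * 1000000)"],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE)

    # BAD: proc.wait() here could block forever -- the child stalls on a
    # full pipe, the parent stalls waiting for the child to exit.
    out, err = proc.communicate()   # reads both pipes while waiting, so no deadlock
    print(len(out))
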
@@ -305,7 +310,7 @@ def _run_cluster(self, model, envs, check_error_log):
         if self._use_reader_alloc:
             tr0_cmd += " --use_reader_alloc"
             tr1_cmd += " --use_reader_alloc"
-        if self._use_cuda:
+        if self.__use_cuda:
             tr0_cmd += " --use_cuda"
             tr1_cmd += " --use_cuda"
             env0 = {"CUDA_VISIBLE_DEVICES": "0"}
@@ -317,15 +322,10 @@ def _run_cluster(self, model, envs, check_error_log):
         env0.update(envs)
         env1.update(envs)
 
-        FNULL = open(os.devnull, 'w')
-
-        tr0_pipe = subprocess.PIPE
-        tr1_pipe = subprocess.PIPE
-        if check_error_log:
-            print("tr0_cmd:{}, env0: {}".format(tr0_cmd, env0))
-            print("tr1_cmd:{}, env1: {}".format(tr1_cmd, env1))
-            tr0_pipe = open("/tmp/tr0_err.log", "wb")
-            tr1_pipe = open("/tmp/tr1_err.log", "wb")
+        print("tr0_cmd:{}, env0: {}".format(tr0_cmd, env0))
+        print("tr1_cmd:{}, env1: {}".format(tr1_cmd, env1))
+        tr0_pipe = open("/tmp/tr0_err.log", "wb")
+        tr1_pipe = open("/tmp/tr1_err.log", "wb")
 
         tr0_proc = subprocess.Popen(
             tr0_cmd.strip().split(" "),
@@ -338,29 +338,22 @@ def _run_cluster(self, model, envs, check_error_log):
             stderr=tr1_pipe,
             env=env1)
 
-        tr0_proc.wait()
-        tr1_proc.wait()
-
         tr0_out, tr0_err = tr0_proc.communicate()
         tr0_loss_text = cpt.to_text(tr0_out)
         tr1_out, tr1_err = tr1_proc.communicate()
         tr1_loss_text = cpt.to_text(tr1_out)
 
         # close trainer file
-        if check_error_log:
-            tr0_pipe.close()
-            tr1_pipe.close()
+        tr0_pipe.close()
+        tr1_pipe.close()
 
-            ps0_pipe.close()
-            ps1_pipe.close()
+        ps0_pipe.close()
+        ps1_pipe.close()
 
         # FIXME: use terminate() instead of sigkill.
         os.kill(ps0.pid, signal.SIGKILL)
         os.kill(ps1.pid, signal.SIGKILL)
         ps0.terminate()
         ps1.terminate()
-        ps0.wait()
-        ps1.wait()
-        FNULL.close()
 
         # print log
         sys.stderr.write('trainer 0 stdout:\n%s\n' % tr0_loss_text)
@@ -385,6 +378,7 @@ def check_with_place(self,
             "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
             "FLAGS_fraction_of_gpu_memory_to_use": "0.15",
             "FLAGS_cudnn_deterministic": "1",
+            "http_proxy": ""
         }
 
         required_envs.update(need_envs)
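Two smaller behavioural notes on the hunks above. First, pserver and trainer stderr now always lands in fixed files under /tmp instead of only when check_error_log is set, so a failed CI run can be inspected after the fact. A sketch (only the file names come from the diff; everything else is illustrative):

    # Dump whatever the last run left behind.
    for log in ("/tmp/ps0_err.log", "/tmp/ps1_err.log",
                "/tmp/tr0_err.log", "/tmp/tr1_err.log"):
        try:
            with open(log, "rb") as f:
                print("==== %s ====" % log)
                print(f.read().decode("utf-8", errors="replace"))
        except IOError:     # a log may not exist for a local-only run
            pass

Second, adding "http_proxy": "" to required_envs presumably keeps the loopback connections between the spawned trainers and pservers from being routed through a proxy configured on the CI host.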