Skip to content

Commit d021900

Browse files
authored
Cherry pick install check for multi gpu (#18245)
* test=develop, add add_multi_gpu_install_check (#18157) * test=develop, add add_multi_gpu_install_check * test=develop, refine warning doc * test=develop, refine warning doc * test=develop, refine warning doc * test=develop, support multi cpu * test=release/1.5, cherry-picked from develop
1 parent 0648376 commit d021900

File tree

2 files changed

+132
-24
lines changed

2 files changed

+132
-24
lines changed

python/paddle/fluid/install_check.py

Lines changed: 128 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -12,15 +12,50 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15-
from .framework import Program, program_guard, unique_name, default_startup_program
15+
import os
16+
from . import core
17+
18+
19+
def process_env():
    """Decide which device indices the install check should run on.

    Reads ``CUDA_VISIBLE_DEVICES`` and returns a list of consecutive
    indices ``[0, 1, ..., n-1]``, one per visible device. The indices are
    positions in the visible list, not the physical ids written in the
    variable.

    Side effects:
        * If the variable is unset, it is set to ``"0,1"`` when
          ``core.get_cuda_device_count()`` reports more than one device,
          otherwise to ``"0"``.
        * If the variable is set but names no device (empty, whitespace
          only, or only commas), it is overwritten with ``"0,1"``.

    Returns:
        list[int]: device indices to use for the check.
    """
    visible = os.environ.get('CUDA_VISIBLE_DEVICES')

    if visible is None:
        # Nothing requested by the user: pick two devices when available,
        # one otherwise, and export the choice.
        if core.get_cuda_device_count() > 1:
            os.environ['CUDA_VISIBLE_DEVICES'] = "0,1"
            return [0, 1]
        os.environ['CUDA_VISIBLE_DEVICES'] = "0"
        return [0]

    # Ignore blank entries so that values such as "0,1," or "0,,1" are not
    # counted as having an extra device.
    entries = [item for item in visible.split(",") if item.strip()]
    if not entries:
        os.environ['CUDA_VISIBLE_DEVICES'] = "0,1"
        return [0, 1]

    return list(range(len(entries)))
40+
41+
42+
# Device indices used by the install check. CUDA builds derive them from
# the environment via process_env(); otherwise two CPU slots (0 and 1)
# are used.
device_list = process_env() if core.is_compiled_with_cuda() else [0, 1]
47+
48+
from .framework import Program, program_guard, unique_name
1649
from .param_attr import ParamAttr
1750
from .initializer import Constant
1851
from . import layers
1952
from . import backward
2053
from .dygraph import Layer, nn
2154
from . import executor
22-
55+
from . import optimizer
2356
from . import core
57+
from . import compiler
58+
import logging
2459
import numpy as np
2560

2661
__all__ = ['run_check']
@@ -45,25 +80,94 @@ def run_check():
4580
This func should not be called only if you need to verify installation
4681
'''
4782
print("Running Verify Fluid Program ... ")
48-
prog = Program()
49-
startup_prog = Program()
50-
scope = core.Scope()
51-
with executor.scope_guard(scope):
52-
with program_guard(prog, startup_prog):
53-
with unique_name.guard():
54-
np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
55-
inp = layers.data(
56-
name="inp", shape=[2, 2], append_batch_size=False)
57-
simple_layer = SimpleLayer("simple_layer")
58-
out = simple_layer(inp)
59-
param_grads = backward.append_backward(
60-
out, parameter_list=[simple_layer._fc1._w.name])[0]
61-
exe = executor.Executor(core.CPUPlace(
62-
) if not core.is_compiled_with_cuda() else core.CUDAPlace(0))
63-
exe.run(default_startup_program())
64-
exe.run(feed={inp.name: np_inp},
65-
fetch_list=[out.name, param_grads[1].name])
66-
67-
print(
68-
"Your Paddle Fluid is installed successfully! Let's start deep Learning with Paddle Fluid now"
69-
)
83+
use_cuda = False if not core.is_compiled_with_cuda() else True
84+
place = core.CPUPlace() if not core.is_compiled_with_cuda(
85+
) else core.CUDAPlace(0)
86+
np_inp_single = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
87+
inp = []
88+
for i in range(len(device_list)):
89+
inp.append(np_inp_single)
90+
np_inp_muti = np.array(inp)
91+
np_inp_muti = np_inp_muti.reshape(len(device_list), 2, 2)
92+
93+
def test_parallerl_exe():
    """Run one data-parallel training step across all selected devices.

    Builds a SimpleLayer network with a mean loss and an SGD optimizer,
    compiles it with ``with_data_parallel`` and feeds ``np_inp_muti``.
    Relies on ``use_cuda``, ``place``, ``device_list`` and ``np_inp_muti``
    from the enclosing scope. On non-CUDA builds ``CPU_NUM`` is set to "2".
    """
    main_program = Program()
    init_program = Program()
    local_scope = core.Scope()
    if not use_cuda:
        os.environ['CPU_NUM'] = "2"
    with executor.scope_guard(local_scope):
        with program_guard(main_program, init_program):
            with unique_name.guard():
                strategy = compiler.BuildStrategy()
                strategy.enable_inplace = True
                strategy.memory_optimize = True
                data_var = layers.data(name="inp", shape=[2, 2])
                net = SimpleLayer("simple_layer")
                net_out = net(data_var)
                runner = executor.Executor(place)
                # One place per selected GPU, or two CPU places otherwise.
                if use_cuda:
                    targets = [core.CUDAPlace(idx) for idx in device_list]
                else:
                    targets = [core.CPUPlace(), core.CPUPlace()]
                loss = layers.mean(net_out)
                loss.persistable = True
                optimizer.SGD(learning_rate=0.01).minimize(loss)
                init_program.random_seed = 1
                parallel_program = compiler.CompiledProgram(
                    main_program).with_data_parallel(
                        build_strategy=strategy,
                        loss_name=loss.name,
                        places=targets)
                runner.run(init_program)

                runner.run(parallel_program,
                           feed={data_var.name: np_inp_muti},
                           fetch_list=[loss.name])
129+
130+
def test_simple_exe():
    """Run a forward and backward pass of SimpleLayer on a single device.

    Uses ``use_cuda`` and ``np_inp_single`` from the enclosing scope. On
    non-CUDA builds ``CPU_NUM`` is set to "1".
    """
    main_program = Program()
    init_program = Program()
    local_scope = core.Scope()
    if not use_cuda:
        os.environ['CPU_NUM'] = "1"
    with executor.scope_guard(local_scope):
        with program_guard(main_program, init_program):
            with unique_name.guard():
                data_var = layers.data(
                    name="inp", shape=[2, 2], append_batch_size=False)
                net = SimpleLayer("simple_layer")
                net_out = net(data_var)
                # [0] takes the first entry returned by append_backward;
                # its element 1 is fetched below (presumably the gradient
                # variable -- confirm against the backward API).
                grad_pair = backward.append_backward(
                    net_out, parameter_list=[net._fc1._w.name])[0]
                if core.is_compiled_with_cuda():
                    run_place = core.CUDAPlace(0)
                else:
                    run_place = core.CPUPlace()
                runner = executor.Executor(run_place)
                runner.run(init_program)
                # No program is passed here; presumably the executor falls
                # back to the program installed by program_guard -- confirm.
                runner.run(feed={data_var.name: np_inp_single},
                           fetch_list=[net_out.name, grad_pair[1].name])
151+
152+
test_simple_exe()
153+
154+
print("Your Paddle Fluid works well on SINGLE GPU or CPU.")
155+
try:
156+
test_parallerl_exe()
157+
print("Your Paddle Fluid works well on MUTIPLE GPU or CPU.")
158+
print(
159+
"Your Paddle Fluid is installed successfully! Let's start deep Learning with Paddle Fluid now"
160+
)
161+
except Exception as e:
162+
logging.warning(
163+
"Your Paddle Fluid has some problem with multiple GPU. This may be caused by:"
164+
"\n 1. There is only 1 GPU visible on your Device;"
165+
"\n 2. No.1 or No.2 GPU or both of them are occupied now"
166+
"\n 3. Wrong installation of NVIDIA-NCCL2, please follow instruction on https://github.com/NVIDIA/nccl-tests "
167+
"\n to test your NCCL, or reinstall it following https://docs.nvidia.com/deeplearning/sdk/nccl-install-guide/index.html"
168+
)
169+
170+
print("\n Original Error is: {}".format(e))
171+
print(
172+
"Your Paddle Fluid is installed successfully ONLY for SINGLE GPU or CPU! "
173+
"\n Let's start deep Learning with Paddle Fluid now")

python/paddle/fluid/tests/unittests/CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,7 @@ list(REMOVE_ITEM TEST_OPS test_imperative_mnist)
116116
list(REMOVE_ITEM TEST_OPS test_ir_memory_optimize_transformer)
117117
list(REMOVE_ITEM TEST_OPS test_layers)
118118
list(REMOVE_ITEM TEST_OPS test_imperative_ocr_attention_model)
119+
list(REMOVE_ITEM TEST_OPS test_install_check)
119120

120121
# Some ops need to check results when gc is enabled
121122
# Currently, only ops that register NoNeedBufferVarsInference need to do this test
@@ -172,6 +173,9 @@ py_test_modules(test_imperative_mnist_sorted_gradient MODULES test_imperative_mn
172173
py_test_modules(test_imperative_se_resnext MODULES test_imperative_se_resnext ENVS
173174
FLAGS_cudnn_deterministic=1 SERIAL)
174175
set_tests_properties(test_imperative_se_resnext PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
176+
py_test_modules(test_install_check MODULES test_install_check ENVS
177+
FLAGS_cudnn_deterministic=1 SERIAL)
178+
set_tests_properties(test_install_check PROPERTIES LABELS "RUN_TYPE=DIST")
175179

176180
if(WITH_DISTRIBUTE)
177181
py_test_modules(test_dist_train MODULES test_dist_train)

0 commit comments

Comments
 (0)