
Commit fc54fab

[FEATURE] Add g5 instance to CI (#20876)
* Add g5 instance to jenkinsfiles where both p3 and g4 are mentioned
* Remove reference to non-existent restricted-mxnetlinux-gpu-g5
* Enable unittest job on g5
* Fix Jenkinsfile_unix_gpu syntax
* Include A10G arch 86 in build for g5
* Update is_TF32_enabled() for SM arch > 80
* Remove gpu arch 86 from centos builds on cuda 10
* Fix test_convolution_{grouping,dilated_impulse_response}, test_np_linalg_qr
* Fix test_convolution_grouping on A100
* Fix test_rnn_unroll_variant_length
* Fix test_convolution_dilated_impulse_response
* Skip test_np_standard_binary_funcs test of 0-dim array broadcast
* Temporarily add '-s' to pytest cpu tests
* Revert "Temporarily add '-s' to pytest cpu tests" (reverts commit 4a9056a)
* Improve test_rnn_layers_fp{16,32} invocation
* Pin MarkupSafe==2.0.1 to avoid soft_unicode import failure
* Run test_rnn_layers_fp32 only when cuDNN is present
* Fix potential out-of-bounds write in count_sketch.cu
* Revert "Pin MarkupSafe==2.0.1 to avoid soft_unicode import failure" (reverts commit ae17b1f)
1 parent 13b8690 commit fc54fab

File tree: 10 files changed (+77, -32 lines)


ci/Jenkinsfile_utils.groovy

Lines changed: 1 addition & 0 deletions
@@ -250,6 +250,7 @@ def assign_node_labels(args) {
   NODE_LINUX_CPU = args.linux_cpu
   NODE_LINUX_GPU = args.linux_gpu
   NODE_LINUX_GPU_G4 = args.linux_gpu_g4
+  NODE_LINUX_GPU_G5 = args.linux_gpu_g5
   NODE_LINUX_GPU_P3 = args.linux_gpu_p3
   NODE_WINDOWS_CPU = args.windows_cpu
   NODE_WINDOWS_GPU = args.windows_gpu

ci/docker/runtime_functions.sh

Lines changed: 6 additions & 3 deletions
@@ -22,8 +22,11 @@
 
 set -ex
 
-CI_CUDA_COMPUTE_CAPABILITIES="-gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_70,code=sm_70"
-CI_CMAKE_CUDA_ARCH="5.2 7.0"
+# compute capabilities for CI instances supported by CUDA 10.x (i.e. p3, g4)
+CI_CMAKE_CUDA10_ARCH="5.2 7.5"
+
+# compute capabilities for CI instances supported by CUDA >= 11.1 (i.e. p3, g4, g5)
+CI_CMAKE_CUDA_ARCH="5.2 7.5 8.6"
 
 clean_repo() {
     set -ex
@@ -298,7 +301,7 @@ build_centos7_gpu() {
         -DUSE_BLAS=Open \
         -DUSE_ONEDNN=ON \
         -DUSE_CUDA=ON \
-        -DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \
+        -DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA10_ARCH" \
        -DUSE_DIST_KVSTORE=ON \
        -DBUILD_EXTENSION_PATH=/work/mxnet/example/extensions/lib_external_ops \
        -DUSE_INT64_TENSOR_SIZE=OFF \
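
Illustration (not part of the commit): CUDA 10.x toolchains cannot target the A10G's sm_86, which is why a separate g5-capable arch list is introduced for CUDA >= 11.1 builds. A minimal Python sketch, assuming a GPU build of MXNet, of checking whether an arch list covers the GPUs on the current host; arch_list_covers_local_gpus is a hypothetical helper written for this example, while get_cuda_compute_capability comes from mxnet.test_utils.

import mxnet as mx
from mxnet.test_utils import get_cuda_compute_capability

CI_CMAKE_CUDA10_ARCH = "5.2 7.5"     # CUDA 10.x CI instances (p3, g4)
CI_CMAKE_CUDA_ARCH = "5.2 7.5 8.6"   # CUDA >= 11.1 CI instances (p3, g4, g5)

def arch_list_covers_local_gpus(arch_list):
    # "8.6" -> 86, the integer form returned by get_cuda_compute_capability()
    supported = {int(a.replace('.', '')) for a in arch_list.split()}
    present = {get_cuda_compute_capability(mx.gpu(i)) for i in range(mx.device.num_gpus())}
    return present <= supported

print(arch_list_covers_local_gpus(CI_CMAKE_CUDA_ARCH))    # True on a g5 (A10G, sm_86) node
print(arch_list_covers_local_gpus(CI_CMAKE_CUDA10_ARCH))  # False on a g5 node: 8.6 is not in the CUDA 10.x list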

ci/jenkins/Jenkins_steps.groovy

Lines changed: 16 additions & 0 deletions
@@ -716,6 +716,22 @@ def test_unix_python3_gpu(lib_name) {
   }]
 }
 
+def test_unix_python3_ampere_gpu(lib_name) {
+  return ['Python3: Ampere-GPU': {
+    node(NODE_LINUX_GPU_G5) {
+      ws('workspace/ut-python3-gpu') {
+        try {
+          utils.unpack_and_init(lib_name, mx_lib_cython)
+          python3_gpu_ut_cython('ubuntu_gpu_cu111')
+          utils.publish_test_coverage()
+        } finally {
+          utils.collect_test_results_unix('tests_gpu.xml', 'tests_python3_ampere_gpu.xml')
+        }
+      }
+    }
+  }]
+}
+
 def test_unix_python3_debug_cpu() {
   return ['Python3: CPU debug': {
     node(NODE_LINUX_CPU) {

ci/jenkins/Jenkinsfile_unix_gpu

Lines changed: 2 additions & 1 deletion
@@ -29,7 +29,7 @@ node('utility') {
   utils = load('ci/Jenkinsfile_utils.groovy')
   custom_steps = load('ci/jenkins/Jenkins_steps.groovy')
 }
-utils.assign_node_labels(utility: 'utility', linux_cpu: 'mxnetlinux-cpu', linux_gpu: 'mxnetlinux-gpu', linux_gpu_p3: 'mxnetlinux-gpu-p3', linux_gpu_g4: 'mxnetlinux-gpu-g4')
+utils.assign_node_labels(utility: 'utility', linux_cpu: 'mxnetlinux-cpu', linux_gpu: 'mxnetlinux-gpu', linux_gpu_p3: 'mxnetlinux-gpu-p3', linux_gpu_g4: 'mxnetlinux-gpu-g4', linux_gpu_g5: 'mxnetlinux-gpu-g5')
 
 utils.main_wrapper(
 core_logic: {
@@ -44,6 +44,7 @@ core_logic: {
 
   utils.parallel_stage('Tests', [
     custom_steps.test_unix_python3_gpu('gpu'),
+    custom_steps.test_unix_python3_ampere_gpu('gpu'),
     custom_steps.test_unix_python3_onednn_gpu('onednn_gpu'),
     custom_steps.test_unix_python3_onednn_nocudnn_gpu('onednn_gpu_nocudnn'),
     custom_steps.test_unix_cpp_package_gpu('gpu'),

python/mxnet/test_utils.py

Lines changed: 3 additions & 3 deletions
@@ -112,15 +112,15 @@ def effective_dtype(dat):
     ----------
     dat : np.ndarray or mx.nd.array or mx.np.ndarray
     """
-    # On arch 80 gpus, a float32-io gemm or conv op will trim the mantissa of data
-    # inputs to be of comparable precision to a float16, so float16 becomes the
+    # On arch 80 gpus or later, a float32-io gemm or conv op will trim the mantissa of
+    # data inputs to be of comparable precision to a float16, so float16 becomes the
     # 'effective dtype' for tolerance tests involving such op outputs.
 
     # Is TF32 enabled in the device (the default on arch 80 GPUs)
     def is_TF32_enabled(device):
         try:
             return (device.device_type == 'gpu' and
-                    get_cuda_compute_capability(device) == 80 and
+                    get_cuda_compute_capability(device) >= 80 and
                     os.environ.get('NVIDIA_TF32_OVERRIDE') != '0')
         except: # pylint: disable=bare-except
             return False
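
Illustration (not part of the commit): a sketch of how a test might consume effective_dtype() to relax tolerances when TF32 makes float32 I/O only float16-accurate; the tolerance values are placeholders chosen for the example, not values taken from MXNet.

import numpy as onp
from mxnet.test_utils import effective_dtype, assert_almost_equal

def check_close(actual, expected):
    # On sm_80+ GPUs with TF32 enabled, effective_dtype() reports float16 even for
    # float32 arrays, so relax the comparison tolerances accordingly.
    if effective_dtype(actual) == onp.dtype(onp.float16):
        rtol, atol = 1e-2, 1e-2   # placeholder fp16-like tolerances
    else:
        rtol, atol = 1e-5, 1e-7   # placeholder full-precision float32 tolerances
    assert_almost_equal(actual, expected, rtol=rtol, atol=atol)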

src/operator/contrib/count_sketch.cu

Lines changed: 3 additions & 0 deletions
@@ -93,6 +93,9 @@ __global__ void sketch_backward_kernel(const int nthreads,
   // only calculate gradient regarding x
   // can also calculate gradient regarding s if needed
   const int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (index >= nthreads) {
+    return;
+  }
   const int i_indim = index % in_dim;
   const int i_sample = index / in_dim;
   const int i_outdim = i_sample * out_dim + h[i_indim];
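
Illustration (not part of the commit): the kernel launcher rounds the grid up to a whole number of blocks, so the last block can contain threads whose index is past nthreads; without the new guard those threads would read and write out of bounds. The arithmetic, with illustrative numbers:

# Example launch arithmetic (illustrative block size and element count).
def launched_threads(nthreads, block_size=256):
    blocks = (nthreads + block_size - 1) // block_size   # round up to whole blocks
    return blocks * block_size

nthreads = 1000
print(launched_threads(nthreads))             # 1024 threads launched
print(launched_threads(nthreads) - nthreads)  # 24 threads see index >= nthreads
# Those 24 threads are exactly the ones the new 'if (index >= nthreads) return;' guard stops.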

tests/python/gpu/test_operator_gpu.py

Lines changed: 8 additions & 1 deletion
@@ -27,7 +27,7 @@
 import mxnet.ndarray.sparse as mxsps
 from mxnet.test_utils import check_consistency, set_default_device, assert_almost_equal, assert_allclose
 from mxnet.test_utils import check_symbolic_forward, check_symbolic_backward, discard_stderr
-from mxnet.test_utils import default_device, rand_shape_2d, rand_ndarray, same, environment, get_rtc_compile_opts
+from mxnet.test_utils import default_device, rand_shape_2d, rand_ndarray, same, environment, get_rtc_compile_opts, get_cuda_compute_capability
 from mxnet.base import MXNetError
 from mxnet import autograd
 
@@ -54,6 +54,13 @@
 
 set_default_device(mx.gpu(0))
 
+# For info purposes, log GPU compute capabilities. Run serially so output appears in log.
+@pytest.mark.serial
+def test_report_compute_capabilities(capsys):
+    with capsys.disabled():
+        sys.stdout.write('= {} '.format(
+            [get_cuda_compute_capability(mx.gpu(i)) for i in range(mx.device.num_gpus())] ))
+
 def check_countsketch(in_dim,out_dim,n):
     data = mx.sym.Variable("data")
     h = mx.sym.Variable("h")
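
Illustration (not part of the commit): the same helpers can be used interactively to see which compute capabilities a node reports, assuming a GPU build of MXNet:

import mxnet as mx
from mxnet.test_utils import get_cuda_compute_capability

# e.g. [86] on a g5 (A10G) node, [75] on a g4 (T4) node, [70] on a p3 (V100) node
print([get_cuda_compute_capability(mx.gpu(i)) for i in range(mx.device.num_gpus())])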

tests/python/unittest/test_gluon_rnn.py

Lines changed: 13 additions & 13 deletions
@@ -606,7 +606,8 @@ def check_rnn_layer_forward(layer, inputs, states=None, run_only=False, device=m
 
 
 @mx.util.use_np
-def run_rnn_layers(dtype, dtype2, device=mx.cpu()):
+def run_rnn_layers(dtype, dtype2):
+    device = default_device()
 
     check_rnn_layer_forward(gluon.rnn.RNN(10, 2, dtype=dtype), mx.np.ones((8, 3, 20), dtype=dtype), device=device)
     check_rnn_layer_forward(gluon.rnn.RNN(10, 2, dtype=dtype, bidirectional=True), mx.np.ones((8, 3, 20), dtype=dtype), mx.np.ones((4, 3, 10), dtype=dtype), device=device)
@@ -668,15 +669,18 @@ def run_rnn_layers(dtype, dtype2, device=mx.cpu()):
         out.backward()
         out = out.asnumpy()
 
+@assert_raises_cudnn_not_satisfied(min_version='5.1.10')
 @pytest.mark.serial
 def test_rnn_layers_fp32():
     run_rnn_layers('float32', 'float32')
 
 @assert_raises_cudnn_not_satisfied(min_version='5.1.10')
-@pytest.mark.skipif(mx.device.num_gpus() == 0, reason="RNN FP16 only implemented for GPU for now")
 @pytest.mark.serial
 def test_rnn_layers_fp16():
-    run_rnn_layers('float16', 'float32', mx.gpu())
+    # Dynamic skip condition is best handled this way, rather than with pytest.mark.skipIf
+    if default_device().device_type == 'cpu':
+        pytest.skip('RNN FP16 only implemented for GPU for now')
+    run_rnn_layers('float16', 'float32')
 
 
 def check_rnn_consistency(fused_layer, stack_layer, loss, mode, num_layers, input_size, hidden_size, bidirectional=False, rtol=1e-2, atol=1e-4):
@@ -844,14 +848,12 @@ def test_rnn_unroll_variant_length():
                                                inputs=data_nd[i:(i+1), :ele_length, :],
                                                merge_outputs=True,
                                                layout='NTC')
-            assert_allclose(ele_out.asnumpy(), outs[i:(i+1), :ele_length, :].asnumpy(),
-                            atol=1E-4, rtol=1E-4)
+            assert_almost_equal(ele_out, outs[i:(i+1), :ele_length, :])
             if ele_length < max_length:
                 # Check the padded outputs are all zero
-                assert_allclose(outs[i:(i+1), ele_length:max_length, :].asnumpy(), 0)
+                assert_almost_equal(outs[i:(i+1), ele_length:max_length, :], 0)
             for valid_out_state, gt_state in zip(states, ele_states):
-                assert_allclose(valid_out_state[i:(i+1)].asnumpy(), gt_state.asnumpy(),
-                                atol=1E-4, rtol=1E-4)
+                assert_almost_equal(valid_out_state[i:(i+1)], gt_state)
 
     # Test for TNC layout
     data_nd = mx.np.random.normal(0, 1, size=(max_length, batch_size, 20))
@@ -864,14 +866,12 @@ def test_rnn_unroll_variant_length():
                                                inputs=data_nd[:ele_length, i:(i+1), :],
                                                merge_outputs=True,
                                                layout='TNC')
-            assert_allclose(ele_out.asnumpy(), outs[:ele_length, i:(i + 1), :].asnumpy(),
-                            atol=1E-4, rtol=1E-4)
+            assert_almost_equal(ele_out, outs[:ele_length, i:(i + 1), :])
             if ele_length < max_length:
                 # Check the padded outputs are all zero
-                assert_allclose(outs[ele_length:max_length, i:(i+1), :].asnumpy(), 0)
+                assert_almost_equal(outs[ele_length:max_length, i:(i+1), :], 0)
             for valid_out_state, gt_state in zip(states, ele_states):
-                assert_allclose(valid_out_state[i:(i+1)].asnumpy(), gt_state.asnumpy(),
-                                atol=1E-4, rtol=1E-4)
+                assert_almost_equal(valid_out_state[i:(i+1)], gt_state)
 
 
 def test_cell_fill_shape():
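
Illustration (not part of the commit): replacing assert_allclose with hard-coded atol/rtol=1E-4 by assert_almost_equal matters on Ampere nodes because assert_almost_equal derives its default tolerances from the arrays being compared, via their effective dtype, roughly along these lines (placeholder numbers; the real defaults live in mxnet.test_utils):

import numpy as onp
from mxnet.test_utils import effective_dtype

def default_tols(x, y):
    # Placeholder logic: loosen tolerances when TensorCore/TF32 execution makes float32
    # results only float16-accurate; mxnet.test_utils computes the real defaults internally.
    if onp.dtype(onp.float16) in (effective_dtype(x), effective_dtype(y)):
        return dict(rtol=1e-2, atol=1e-2)
    return dict(rtol=1e-4, atol=1e-4)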

tests/python/unittest/test_numpy_op.py

Lines changed: 9 additions & 2 deletions
@@ -6477,6 +6477,9 @@ def check_qr(q, r, a_np):
 
         data_np = onp.array(data_np, dtype=dtype)
         data = np.array(data_np, dtype=dtype)
+        if effective_dtype(data) == onp.dtype(np.float16):
+            print('Skipping test on this platform: {} has a float16 effective dtype'.format(dtype))
+            pytest.skip()
 
         data.attach_grad()
         with mx.autograd.record():
@@ -11712,8 +11715,12 @@ def array_values(low, high, shape):
     ((3, 1), (3, 0)),
     ((0, 2), (1, 2)),
     ((2, 3, 4), (3, 1)),
-    ((2, 3), ()),
-    ((), (2, 3))
+    # MXNet numpy does not match original numpy behavior when broadcasting 0-dim arrays.
+    # See https://github.com/apache/incubator-mxnet/issues/20898.
+    # ((2, 3), ()),
+    # ((), (2, 3))
+    ((2, 3), (1,)),
+    ((1,), (2, 3))
 ])
 def test_np_standard_binary_funcs(func, func2, promoted, dtypes, ref_grad_a, ref_grad_b, low, high, lshape, rshape):
     class TestStandardBinary(HybridBlock):
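
Illustration (not part of the commit): upstream NumPy broadcasts a 0-dim array against any shape, which is what the commented-out shape pairs exercised; the MXNet-side discrepancy is tracked in issue 20898, and the replacement (1,) shapes keep a broadcast case without relying on 0-dim operands.

import numpy as onp

a = onp.ones((2, 3))
b = onp.array(5.0)                   # 0-dim array, shape ()
print((a + b).shape)                 # (2, 3): NumPy broadcasts the 0-dim operand
print((a + onp.ones((1,))).shape)    # (2, 3): the substitute (1,) shape broadcasts the same way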

tests/python/unittest/test_operator.py

Lines changed: 16 additions & 9 deletions
@@ -1724,6 +1724,7 @@ def np_groupnorm_grad(ograd, data, gamma, beta, mean, std, num_groups, eps):
                        atol=5e-2 if dtype == np.float16 else 1e-4, dtype=dtype)
 
 
+@pytest.mark.serial
 def test_convolution_grouping():
     for dim in [1, 2, 3]:
         num_filter = 4
@@ -1745,15 +1746,15 @@ def test_convolution_grouping():
         exe1 = y1._simple_bind(default_device(), x=shape)
         exe2 = y2._simple_bind(default_device(), x=shape, w=(num_filter, shape[1]//num_group) + kernel, b=(num_filter,))
         for arr1, arr2 in zip(exe1.arg_arrays, exe2.arg_arrays):
-            arr1[:] = np.float32(np.random.normal(size=arr1.shape))
+            arr1[:] = np.random.normal(size=arr1.shape).astype(effective_dtype(mx.nd.array([1.,])))
             arr2[:] = arr1
         exe1.forward(is_train=True)
         exe1.backward(exe1.outputs[0])
         exe2.forward(is_train=True)
         exe2.backward(exe2.outputs[0])
 
         for arr1, arr2 in zip(exe1.outputs + exe1.grad_arrays, exe2.outputs + exe2.grad_arrays):
-            np.testing.assert_allclose(arr1.asnumpy(), arr2.asnumpy(), rtol=1e-3, atol=1e-3)
+            assert_almost_equal(arr1, arr2)
 
 
 @pytest.mark.skip(reason="Flaky test https://github.com/apache/incubator-mxnet/issues/14052")
@@ -2216,7 +2217,8 @@ def test_bxor(a, b):
     test_bor(a, b)
     test_bxor(a, b)
 
-def test_run_convolution_dilated_impulse_response(dil=(1,1), kernel_shape=(3,3), verbose=False):
+
+def run_convolution_dilated_impulse_response(dil, kernel_shape, tol):
     dim = len(dil)
     assert(len(kernel_shape) == dim)
     # Input for spike response
@@ -2259,7 +2261,7 @@ def test_run_convolution_dilated_impulse_response(dil=(1,1), kernel_shape=(3,3),
     out_o = be.outputs[0].asnumpy()
     assert_allclose(out_o[center],np.prod(kernel_shape),atol=1e-5)
 
-    rnd_kernel_s = np.random.uniform(low=0.0, high=1.0, size=tuple([1,1]+list(kernel_shape))).astype(np.float32)
+    rnd_kernel_s = np.random.uniform(low=-0.5, high=0.5, size=tuple([1,1]+list(kernel_shape))).astype(np.float32)
     impulse_error = mx.nd.array(out_o/np.sum(out_o)) # This should be 1.0 at [0,0,16,16]
     rnd_kernel = mx.nd.array(rnd_kernel_s)
 
@@ -2282,22 +2284,27 @@ def test_run_convolution_dilated_impulse_response(dil=(1,1), kernel_shape=(3,3),
     be.forward(True)
     out = be.outputs[0].asnumpy()
     # Now do a simple check of the kernel gradient
-    assert(out[center] - np.sum(kernel_gradient) - out_orig[center] < 0.001)
-
+    d = np.abs(out[center] - np.sum(kernel_gradient) - out_orig[center])
+    assert d < tol, f'd: {d}'
 
+@pytest.mark.serial
 def test_convolution_dilated_impulse_response():
+    tol = 1e-3
     # 1D
     for dil in [ (1,), (2,), (3,) ]:
         for ks in [ (1,), (2,), (3,), (4,)]:
-            test_run_convolution_dilated_impulse_response(dil=dil, kernel_shape=ks)
+            run_convolution_dilated_impulse_response(dil=dil, kernel_shape=ks, tol=tol)
     # 2D
     for dil in [ (1,1), (2,2), (3,3) ]:
         for ks in [ (3,3), (4,4), (2,3), (3,2), (1,1) ]:
-            test_run_convolution_dilated_impulse_response(dil=dil, kernel_shape=ks)
+            run_convolution_dilated_impulse_response(dil=dil, kernel_shape=ks, tol=tol)
     # 3D
+    # On Ampere, autotuning might select a TensorCore conv engine, which effectively
+    # does a cast to fp16 of the weights and data. Expand tol in these 3D cases.
+    tol3D = 1e-2 if effective_dtype(mx.nd.array([1.,])) == np.float16 else tol
     for dil in [ (1,1,1), (2,2,2), (3,3,3) ]:
         for ks in [ (3,3,3), (4,4,4), (2,3,4), (3,2,4), (1,1,1) ]:
-            test_run_convolution_dilated_impulse_response(dil=dil, kernel_shape=ks)
+            run_convolution_dilated_impulse_response(dil=dil, kernel_shape=ks, tol=tol3D)
 
 
 @pytest.mark.serial
