Skip to content

Commit 30e3a31

Browse files
liqiangxlwujingyuegreptile-apps[bot]
authored
rename to test_multidevice_tutorial (#5756)
**Two minor change:** 1. Rename from `tutorial_multidevice` to `test_multidevice_tutorial` 2. Noticed 2 test failures in a local node with 1 gpu. Revised to skip these two tests if there is only 1 gpu. Do we know why CI didn’t catch this issue? I’m wondering if it might be related to CI consistently running this test on nodes with more than one GPU. @xwang233 **After revision:** (in a node with 1 gpu) ``` [ SKIPPED ] 2 tests, listed below: [ SKIPPED ] MultiDeviceTutorial.SimplePipelining [ SKIPPED ] MultiDeviceTutorial.HostIrKernekPipelining ``` **Original errs:** ``` [ RUN ] MultiDeviceTutorial.SimplePipelining unknown file: Failure C++ exception with description "Expected (requested_n_gpus)<=(communicator_->size()) . Found 2 vs 1. Exception raised from validate at /opt/pytorch/nvfuser/csrc/host_ir/evaluator.cpp:134 (most recent call first): frame #0: nvfuser::nvfCheckFail(char const*, char const*, long, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0x110 (0xbbbd9d5e8530 in ./test_tutorial_multidevice) frame #1: <unknown function> + 0x6e8e58 (0xbbbd9d988e58 in ./test_tutorial_multidevice) frame #2: <unknown function> + 0x6ea55c (0xbbbd9d98a55c in ./test_tutorial_multidevice) frame #3: <unknown function> + 0x971fb8 (0xbbbd9dc11fb8 in ./test_tutorial_multidevice) frame #4: <unknown function> + 0xddcb84 (0xbbbd9e07cb84 in ./test_tutorial_multidevice) frame #5: <unknown function> + 0xe2f6a0 (0xbbbd9e0cf6a0 in ./test_tutorial_multidevice) frame #6: <unknown function> + 0xe15a94 (0xbbbd9e0b5a94 in ./test_tutorial_multidevice) frame #7: <unknown function> + 0xe15f88 (0xbbbd9e0b5f88 in ./test_tutorial_multidevice) frame #8: <unknown function> + 0xe16584 (0xbbbd9e0b6584 in ./test_tutorial_multidevice) frame #9: <unknown function> + 0xe23830 (0xbbbd9e0c3830 in ./test_tutorial_multidevice) frame #10: <unknown function> + 0xe16760 (0xbbbd9e0b6760 in ./test_tutorial_multidevice) frame #11: <unknown function> + 0x351104 (0xbbbd9d5f1104 in ./test_tutorial_multidevice) frame #12: <unknown function> + 0x284c4 (0xfc5a4eb684c4 in /usr/lib/aarch64-linux-gnu/libc.so.6) frame #13: __libc_start_main + 0x98 (0xfc5a4eb68598 in /usr/lib/aarch64-linux-gnu/libc.so.6) frame #14: <unknown function> + 0x36bd70 (0xbbbd9d60bd70 in ./test_tutorial_multidevice) " thrown in the test body. To reproduce: NVFUSER_TEST_RANDOM_SEED=1767623470 NVFUSER_TEST_ATEN_RANDOM_SEED=0 test_nvfuser --gtest_filter='MultiDeviceTutorial.SimplePipelining' [ FAILED ] MultiDeviceTutorial.SimplePipelining (0 ms) ``` ``` [ RUN ] MultiDeviceTutorial.HostIrKernekPipelining [gb-nvl-118-compute03:111994:0:111994] Caught signal 11 (Segmentation fault: address not mapped to object at address 0x8) ==== backtrace (tid: 111994) ==== 0 /opt/hpcx/ucx/lib/libucs.so.0(ucs_handle_error+0x2cc) [0xfc5a14ff19fc] 1 /opt/hpcx/ucx/lib/libucs.so.0(+0x31bac) [0xfc5a14ff1bac] 2 /opt/hpcx/ucx/lib/libucs.so.0(+0x31ed8) [0xfc5a14ff1ed8] 3 linux-vdso.so.1(__kernel_rt_sigreturn+0) [0xfc5a7edc0968] 4 ./test_tutorial_multidevice(+0x6f38b4) [0xbbbd9d9938b4] 5 ./test_tutorial_multidevice(+0xde1ab4) [0xbbbd9e081ab4] 6 ./test_tutorial_multidevice(+0xe2f6a0) [0xbbbd9e0cf6a0] 7 ./test_tutorial_multidevice(+0xe15a94) [0xbbbd9e0b5a94] 8 ./test_tutorial_multidevice(+0xe15f88) [0xbbbd9e0b5f88] 9 ./test_tutorial_multidevice(+0xe16584) [0xbbbd9e0b6584] 10 ./test_tutorial_multidevice(+0xe23830) [0xbbbd9e0c3830] 11 ./test_tutorial_multidevice(+0xe16760) [0xbbbd9e0b6760] 12 ./test_tutorial_multidevice(+0x351104) [0xbbbd9d5f1104] 13 /usr/lib/aarch64-linux-gnu/libc.so.6(+0x284c4) [0xfc5a4eb684c4] 14 /usr/lib/aarch64-linux-gnu/libc.so.6(__libc_start_main+0x98) [0xfc5a4eb68598] 15 ./test_tutorial_multidevice(+0x36bd70) [0xbbbd9d60bd70] ``` --------- Co-authored-by: Jingyue Wu <[email protected]> Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>
1 parent 1bbd375 commit 30e3a31

File tree

3 files changed

+7
-6
lines changed

3 files changed

+7
-6
lines changed

CMakeLists.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1237,8 +1237,8 @@ if(BUILD_TEST)
12371237
${NVFUSER_ROOT}/tests/cpp/multidevice.cpp
12381238
${NVFUSER_ROOT}/tests/cpp/test_multidevice_tutorial.cpp
12391239
)
1240-
add_test_without_main(tutorial_multidevice "${MULTIDEVICE_TUTORIAL_SRCS}" "")
1241-
list(APPEND TEST_BINARIES tutorial_multidevice)
1240+
add_test_without_main(test_multidevice_tutorial "${MULTIDEVICE_TUTORIAL_SRCS}" "")
1241+
list(APPEND TEST_BINARIES test_multidevice_tutorial)
12421242

12431243
add_test(test_reshape "${NVFUSER_ROOT}/tests/cpp/test_reshape.cpp" "")
12441244
list(APPEND TEST_BINARIES test_reshape)

manual_ci.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ fi
6868
echo ""
6969
echo "Running C++ Binary Tests..."
7070
# Tests that require MPI
71-
MPI_TESTS=("test_multidevice" "tutorial_multidevice")
71+
MPI_TESTS=("test_multidevice" "test_multidevice_tutorial")
7272

7373
# Find all test_* and tutorial_* binaries in bin/
7474
if [ -d "./bin" ]; then

tests/cpp/test_multidevice_tutorial.cpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ class MultiDeviceTutorial : public MultiDeviceTest {
3131
GTEST_SKIP() << "Distributed setting not available. "
3232
<< "Make sure you are on a node with n>1 GPUs and run "
3333
<< "`mpirun -np n -x NVFUSER_TUTORIAL_VERBOSE=1 "
34-
"tutorial_multidevice`";
34+
"test_multidevice_tutorial`";
3535
}
3636
}
3737

@@ -43,7 +43,7 @@ bool MultiDeviceTutorial::verbose_ = false;
4343

4444
// To run those tests, allocate a node with n>1 GPUs and run:
4545
//
46-
// mpirun -np n -x NVFUSER_TUTORIAL_VERBOSE=1 tutorial_multidevice
46+
// mpirun -np n -x NVFUSER_TUTORIAL_VERBOSE=1 test_multidevice_tutorial
4747
//
4848
// We use a SPMD paradigm, where each host process manages one and only device,
4949
// and each device executes the same program. Therefore, the number of process
@@ -311,6 +311,7 @@ TEST_F(MultiDeviceTutorial, SimplePipelining) {
311311
// device 1. This implies that a network communication needs to be executed.
312312
// More precisely, to produce tv2, we need device 0 to send tv1 to device 1.
313313

314+
SKIP_IF_NOT_ENOUGH_DEVICES(fusion);
314315
MultiDeviceExecutor multidevice_executor(std::move(fusion), *communicator_);
315316
if (verbose_ && communicator_->deviceId() < 2) {
316317
std::cout << "Device ID = " << communicator_->deviceId() << std::endl;
@@ -992,7 +993,7 @@ TEST_F(MultiDeviceTutorial, HostIrGemmReduceScatter) {
992993
| tv2[i,...] = Fusion1 (tv1_i)
993994
*/
994995
// To do so, we will be using new Host IRs: Stream (a Val), SetStream, ForLoop.
995-
TEST_F(MultiDeviceTutorial, HostIrKernekPipelining) {
996+
TEST_F(MultiDeviceTutorial, DISABLED_HostIrKernelPipelining) {
996997
constexpr int64_t kNDims = 2;
997998
constexpr int64_t kPipelineAxis = 0;
998999
constexpr int64_t kNumberOfStreams = 4;

0 commit comments

Comments
 (0)