NVIDIA · wujingyue · Jan 6, 2026 · Jan 5, 2026 · Jan 5, 2026 · Jan 5, 2026
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -1237,8 +1237,8 @@ if(BUILD_TEST)
     ${NVFUSER_ROOT}/tests/cpp/multidevice.cpp
     ${NVFUSER_ROOT}/tests/cpp/test_multidevice_tutorial.cpp
   )
-  add_test_without_main(tutorial_multidevice "${MULTIDEVICE_TUTORIAL_SRCS}" "")
-  list(APPEND TEST_BINARIES tutorial_multidevice)
+  add_test_without_main(test_multidevice_tutorial "${MULTIDEVICE_TUTORIAL_SRCS}" "")
+  list(APPEND TEST_BINARIES test_multidevice_tutorial)
 
   add_test(test_reshape "${NVFUSER_ROOT}/tests/cpp/test_reshape.cpp" "")
   list(APPEND TEST_BINARIES test_reshape)

diff --git a/manual_ci.sh b/manual_ci.sh
@@ -68,7 +68,7 @@ fi
 echo ""
 echo "Running C++ Binary Tests..."
 # Tests that require MPI
-MPI_TESTS=("test_multidevice" "tutorial_multidevice")
+MPI_TESTS=("test_multidevice" "test_multidevice_tutorial")
 
 # Find all test_* and tutorial_* binaries in bin/
 if [ -d "./bin" ]; then

diff --git a/tests/cpp/test_multidevice_tutorial.cpp b/tests/cpp/test_multidevice_tutorial.cpp
@@ -31,7 +31,7 @@ class MultiDeviceTutorial : public MultiDeviceTest {
       GTEST_SKIP() << "Distributed setting not available. "
                    << "Make sure you are on a node with n>1 GPUs and run "
                    << "`mpirun -np n -x NVFUSER_TUTORIAL_VERBOSE=1 "
-                      "tutorial_multidevice`";
+                      "test_multidevice_tutorial`";
     }
   }
 
@@ -43,7 +43,7 @@ bool MultiDeviceTutorial::verbose_ = false;
 
 // To run those tests, allocate a node with n>1 GPUs and run:
 //
-// mpirun -np n -x NVFUSER_TUTORIAL_VERBOSE=1 tutorial_multidevice
+// mpirun -np n -x NVFUSER_TUTORIAL_VERBOSE=1 test_multidevice_tutorial
 //
 // We use an SPMD paradigm, where each host process manages exactly one device,
 // and each device executes the same program. Therefore, the number of process
@@ -311,6 +311,7 @@ TEST_F(MultiDeviceTutorial, SimplePipelining) {
   // device 1. This implies that a network communication needs to be executed.
   // More precisely, to produce tv2, we need device 0 to send tv1 to device 1.
 
+  SKIP_IF_NOT_ENOUGH_DEVICES(fusion);
   MultiDeviceExecutor multidevice_executor(std::move(fusion), *communicator_);
   if (verbose_ && communicator_->deviceId() < 2) {
     std::cout << "Device ID = " << communicator_->deviceId() << std::endl;
@@ -992,7 +993,7 @@ TEST_F(MultiDeviceTutorial, HostIrGemmReduceScatter) {
   |   tv2[i,...] = Fusion1 (tv1_i)
 */
 // To do so, we will be using new Host IRs: Stream (a Val), SetStream, ForLoop.
-TEST_F(MultiDeviceTutorial, HostIrKernekPipelining) {
+TEST_F(MultiDeviceTutorial, DISABLED_HostIrKernelPipelining) {
   constexpr int64_t kNDims = 2;
   constexpr int64_t kPipelineAxis = 0;
   constexpr int64_t kNumberOfStreams = 4;