
Commit 60d4dac

DomBrown authored
Port multi GPU changes to GitHub (#3027)
Signed-off-by: Dom Brown <3886319+DomBrown@users.noreply.github.com>
1 parent 047f2b2 commit 60d4dac

File tree

4 files changed: +308 additions, -171 deletions


cpp/tests/executor/executorTest.cpp

Lines changed: 12 additions & 0 deletions
@@ -4117,10 +4117,16 @@ TEST_P(TimeoutTest, TimeoutStreamingTest)
     }
     else
     {
+        if (val != NULL && !isMultiGpu)
+        {
+            GTEST_SKIP() << "Skipping SingleGpu tests";
+        }
+
         if (!isMultiGpu && !useOrchestratorMode)
         {
             GTEST_SKIP() << "Leader mode on single GPU crashes";
         }
+
         // Check that it was launched with right number of MPI ranks
         if (!useOrchestratorMode && COMM_SESSION.getSize() != 4)
         {
@@ -4324,10 +4330,16 @@ TEST_P(TimeoutTest, TimeoutNonstreamingTest)
     }
     else
    {
+        if (val != NULL && !isMultiGpu)
+        {
+            GTEST_SKIP() << "Skipping SingleGpu tests";
+        }
+
         if (!isMultiGpu && !useOrchestratorMode)
         {
             GTEST_SKIP() << "Leader mode on single GPU crashes";
         }
+
         // Check that it was launched with right number of MPI ranks
         if (!useOrchestratorMode && COMM_SESSION.getSize() != 4)
         {

tests/integration/defs/cpp_common.py

Lines changed: 81 additions & 69 deletions
@@ -5,8 +5,14 @@
 import os as _os
 import pathlib as _pl
 import subprocess as _sp
+import sys as _sys
 from typing import Generator, List, Optional, Sequence
 
+build_script_dir = _pl.Path(
+    __file__).parent.resolve().parent.parent.parent / "scripts"
+assert build_script_dir.is_dir()
+_sys.path.append(str(build_script_dir))
+
 from build_wheel import get_build_dir as get_trt_llm_build_dir
 
 default_test_parallel = 2
@@ -237,8 +243,7 @@ def produce_mpirun_command(*, global_commands, nranks, local_commands,
     return l[:-1]
 
 
-def run_multi_gpu_tests(build_dir: _pl.Path, timeout=1500):
-
+def run_simple_multi_gpu_tests(build_dir: _pl.Path, timeout=1500):
     tests_dir = build_dir / "tests"
     cpp_env = {**_os.environ}
     # Utils tests
@@ -278,17 +283,27 @@ def run_multi_gpu_tests(build_dir: _pl.Path, timeout=1500):
                 env=new_env,
                 timeout=600)
 
-    xml_output_file = build_dir / "results-multi-gpu-real-decoder.xml"
-    trt_model_test = produce_mpirun_command(
-        global_commands=["mpirun", "--allow-run-as-root"],
-        nranks=4,
-        local_commands=[
-            "batch_manager/trtGptModelRealDecoderTest",
-            "--gtest_filter=*TP*:*PP*"
-        ],
-        leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
-    run_command(trt_model_test, cwd=tests_dir, env=cpp_env,
-                timeout=timeout)  # expecting ~ 1200s
+    # UCX transceiver tests, the test may not be built if ENABLE_UCX is 0
+    if _os.path.exists(
+            _os.path.join(tests_dir, "batch_manager/ucxDataTransceiverTest")):
+        ucx_env = copy.copy(cpp_env)
+        ucx_env["UCX_MEMTYPE_CACHE"] = "n"
+        ucx_trans_test = [
+            "mpirun",
+            "-n",
+            "2",
+            "--allow-run-as-root",
+            "batch_manager/ucxDataTransceiverTest",
+        ]
+        run_command(ucx_trans_test, cwd=tests_dir, env=ucx_env, timeout=300)
+    else:
+        _logger.info(
+            "batch_manager/ucxDataTransceiverTest not found, so skipping.")
+
+
+def run_llama_executor_multi_gpu_tests(build_dir: _pl.Path, timeout=1500):
+    tests_dir = build_dir / "tests"
+    cpp_env = {**_os.environ}
 
     mgpu_env = copy.copy(cpp_env)
     mgpu_env["RUN_LLAMA_MULTI_GPU"] = "true"
@@ -316,19 +331,6 @@ def run_multi_gpu_tests(build_dir: _pl.Path, timeout=1500):
     ]
     run_command(trt_model_test, cwd=tests_dir, env=mgpu_env, timeout=1500)
 
-    #EncDec test in leader mode
-    xml_output_file = build_dir / "results-multi-gpu-t5-exec-leader-mode.xml"
-    trt_model_test = produce_mpirun_command(
-        global_commands=["mpirun", "--allow-run-as-root"],
-        nranks=4,
-        local_commands=[
-            "executor/encDecTest",
-            "--gtest_filter=T5MultiGPUTest/EncDecParamsTest.Forward*"
-        ],
-        leader_commands=[f"--gtest_output=xml:{xml_output_file}"],
-    )
-    run_command(trt_model_test, cwd=tests_dir, env=cpp_env, timeout=1500)
-
     #Logits processor and guided decoding test in leader mode
     xml_output_file = build_dir / "results-multi-gpu-logits-proc.xml"
     tp_pp_sizes = [(4, 1), (2, 2), (1, 4)]
@@ -350,24 +352,44 @@ def run_multi_gpu_tests(build_dir: _pl.Path, timeout=1500):
         leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
     run_command(trt_model_test, cwd=tests_dir, env=mgpu_env, timeout=1500)
 
-    # UCX transceiver tests, the test may not be built if ENABLE_UCX is 0
-    if _os.path.exists(
-            _os.path.join(tests_dir, "batch_manager/ucxDataTransceiverTest")):
-        ucx_env = copy.copy(cpp_env)
-        ucx_env["UCX_MEMTYPE_CACHE"] = "n"
-        ucx_trans_test = [
-            "mpirun",
-            "-n",
-            "2",
-            "--allow-run-as-root",
-            "batch_manager/ucxDataTransceiverTest",
-        ]
-        run_command(ucx_trans_test, cwd=tests_dir, env=ucx_env, timeout=300)
 
-    run_disagg_tests(build_dir)
+def run_t5_multi_gpu_tests(build_dir: _pl.Path, timeout=1500):
+    tests_dir = build_dir / "tests"
+    cpp_env = {**_os.environ}
 
+    #EncDec test in leader mode
+    xml_output_file = build_dir / "results-multi-gpu-t5-exec-leader-mode.xml"
+    trt_model_test = produce_mpirun_command(
+        global_commands=["mpirun", "--allow-run-as-root"],
+        nranks=4,
+        local_commands=[
+            "executor/encDecTest",
+            "--gtest_filter=T5MultiGPUTest/EncDecParamsTest.Forward*"
+        ],
+        leader_commands=[f"--gtest_output=xml:{xml_output_file}"],
+    )
+    run_command(trt_model_test, cwd=tests_dir, env=cpp_env, timeout=1500)
+
+
+def run_trt_gpt_model_real_decoder_multi_gpu_tests(build_dir: _pl.Path,
+                                                   timeout=1500):
+    tests_dir = build_dir / "tests"
+    cpp_env = {**_os.environ}
+
+    xml_output_file = build_dir / "results-multi-gpu-real-decoder.xml"
+    trt_model_test = produce_mpirun_command(
+        global_commands=["mpirun", "--allow-run-as-root"],
+        nranks=4,
+        local_commands=[
+            "batch_manager/trtGptModelRealDecoderTest",
+            "--gtest_filter=*TP*:*PP*"
+        ],
+        leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
+    run_command(trt_model_test, cwd=tests_dir, env=cpp_env,
+                timeout=timeout)  # expecting ~ 1200s
 
-def run_disagg_tests(build_dir: _pl.Path):
+
+def run_disagg_multi_gpu_tests(build_dir: _pl.Path):
 
     tests_dir = build_dir / "tests"
     cpp_env = {**_os.environ}
@@ -549,38 +571,28 @@ def prepare_model_tests(model_name: str,
                 timeout=600)
 
 
-def prepare_multi_gpu_model_tests(python_exe: str,
+def prepare_multi_gpu_model_tests(test_list: List[str],
+                                  python_exe: str,
                                   root_dir: _pl.Path,
                                   resources_dir: _pl.Path,
                                   model_cache: Optional[str] = None):
     model_cache_arg = ["--model_cache", model_cache] if model_cache else []
-    only_multi_gpu_arg = ["--only_multi_gpu"]
-
-    prepare_model_tests(model_name="llama",
-                        python_exe=python_exe,
-                        root_dir=root_dir,
-                        resources_dir=resources_dir,
-                        model_cache_arg=model_cache_arg,
-                        only_multi_gpu_arg=only_multi_gpu_arg)
-
-    prepare_model_tests(model_name="llama",
-                        python_exe=python_exe,
-                        root_dir=root_dir,
-                        resources_dir=resources_dir,
-                        model_cache_arg=model_cache_arg)
-
-    prepare_model_tests(model_name="t5",
-                        python_exe=python_exe,
-                        root_dir=root_dir,
-                        resources_dir=resources_dir,
-                        model_cache_arg=model_cache_arg,
-                        only_multi_gpu_arg=['--tp', '4', '--pp', '1'])
-
-    prepare_model_tests(model_name="gpt",
-                        python_exe=python_exe,
-                        root_dir=root_dir,
-                        resources_dir=resources_dir,
-                        model_cache_arg=model_cache_arg)
+
+    if "llama" in test_list:
+        prepare_model_tests(model_name="llama",
+                            python_exe=python_exe,
+                            root_dir=root_dir,
+                            resources_dir=resources_dir,
+                            model_cache_arg=model_cache_arg,
+                            only_multi_gpu_arg=["--only_multi_gpu"])
+
+    if "t5" in test_list:
+        prepare_model_tests(model_name="t5",
+                            python_exe=python_exe,
+                            root_dir=root_dir,
+                            resources_dir=resources_dir,
+                            model_cache_arg=model_cache_arg,
+                            only_multi_gpu_arg=['--tp', '4', '--pp', '1'])
 
 
 def run_single_gpu_tests(build_dir: _pl.Path,
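
The net effect of the cpp_common.py changes is that the old monolithic run_multi_gpu_tests entry point is split into independent runners (run_simple_multi_gpu_tests, run_llama_executor_multi_gpu_tests, run_t5_multi_gpu_tests, run_trt_gpt_model_real_decoder_multi_gpu_tests, run_disagg_multi_gpu_tests), and prepare_multi_gpu_model_tests now only builds the model families named in its new test_list argument. The sketch below shows how a driver could wire these pieces together. It is illustrative only: the actual callers live in the other two changed files of this commit, which are not shown in this excerpt, and the module path defs.cpp_common, the function run_all_multi_gpu_stages, and its arguments are assumptions, not part of the commit.

# Hypothetical driver sketch -- not taken from this commit's diff.
# Assumes tests/integration/defs is importable as the "defs" package and that
# build_dir, python_exe, root_dir, resources_dir and model_cache are supplied
# by the surrounding test harness.
from defs.cpp_common import (prepare_multi_gpu_model_tests,
                             run_simple_multi_gpu_tests,
                             run_llama_executor_multi_gpu_tests,
                             run_t5_multi_gpu_tests,
                             run_trt_gpt_model_real_decoder_multi_gpu_tests,
                             run_disagg_multi_gpu_tests)


def run_all_multi_gpu_stages(build_dir, python_exe, root_dir, resources_dir,
                             model_cache=None):
    # Build engines only for the model families this stage actually exercises.
    prepare_multi_gpu_model_tests(test_list=["llama", "t5"],
                                  python_exe=python_exe,
                                  root_dir=root_dir,
                                  resources_dir=resources_dir,
                                  model_cache=model_cache)

    # Each runner is now a separate entry point, so a CI scheduler can run
    # them in one job (as here) or split them across jobs.
    run_simple_multi_gpu_tests(build_dir)
    run_llama_executor_multi_gpu_tests(build_dir)
    run_t5_multi_gpu_tests(build_dir)
    run_trt_gpt_model_real_decoder_multi_gpu_tests(build_dir, timeout=1500)
    run_disagg_multi_gpu_tests(build_dir)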
