
Commit 60d4dac

DomBrown authored
Port multi GPU changes to GitHub (#3027)
Signed-off-by: Dom Brown <3886319+DomBrown@users.noreply.github.com>
1 parent 047f2b2 commit 60d4dac

File tree

4 files changed: +308 additions, -171 deletions


cpp/tests/executor/executorTest.cpp

Lines changed: 12 additions & 0 deletions
@@ -4117,10 +4117,16 @@ TEST_P(TimeoutTest, TimeoutStreamingTest)
     }
     else
     {
+        if (val != NULL && !isMultiGpu)
+        {
+            GTEST_SKIP() << "Skipping SingleGpu tests";
+        }
+
         if (!isMultiGpu && !useOrchestratorMode)
         {
             GTEST_SKIP() << "Leader mode on single GPU crashes";
         }
+
         // Check that it was launched with right number of MPI ranks
         if (!useOrchestratorMode && COMM_SESSION.getSize() != 4)
         {
@@ -4324,10 +4330,16 @@ TEST_P(TimeoutTest, TimeoutNonstreamingTest)
     }
     else
    {
+        if (val != NULL && !isMultiGpu)
+        {
+            GTEST_SKIP() << "Skipping SingleGpu tests";
+        }
+
         if (!isMultiGpu && !useOrchestratorMode)
         {
             GTEST_SKIP() << "Leader mode on single GPU crashes";
         }
+
         // Check that it was launched with right number of MPI ranks
         if (!useOrchestratorMode && COMM_SESSION.getSize() != 4)
         {

tests/integration/defs/cpp_common.py

Lines changed: 81 additions & 69 deletions
@@ -5,8 +5,14 @@
 import os as _os
 import pathlib as _pl
 import subprocess as _sp
+import sys as _sys
 from typing import Generator, List, Optional, Sequence
 
+build_script_dir = _pl.Path(
+    __file__).parent.resolve().parent.parent.parent / "scripts"
+assert build_script_dir.is_dir()
+_sys.path.append(str(build_script_dir))
+
 from build_wheel import get_build_dir as get_trt_llm_build_dir
 
 default_test_parallel = 2
@@ -237,8 +243,7 @@ def produce_mpirun_command(*, global_commands, nranks, local_commands,
     return l[:-1]
 
 
-def run_multi_gpu_tests(build_dir: _pl.Path, timeout=1500):
-
+def run_simple_multi_gpu_tests(build_dir: _pl.Path, timeout=1500):
     tests_dir = build_dir / "tests"
     cpp_env = {**_os.environ}
     # Utils tests
@@ -278,17 +283,27 @@ def run_multi_gpu_tests(build_dir: _pl.Path, timeout=1500):
                 env=new_env,
                 timeout=600)
 
-    xml_output_file = build_dir / "results-multi-gpu-real-decoder.xml"
-    trt_model_test = produce_mpirun_command(
-        global_commands=["mpirun", "--allow-run-as-root"],
-        nranks=4,
-        local_commands=[
-            "batch_manager/trtGptModelRealDecoderTest",
-            "--gtest_filter=*TP*:*PP*"
-        ],
-        leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
-    run_command(trt_model_test, cwd=tests_dir, env=cpp_env,
-                timeout=timeout)  # expecting ~ 1200s
+    # UCX transceiver tests, the test may not be built if ENABLE_UCX is 0
+    if _os.path.exists(
+            _os.path.join(tests_dir, "batch_manager/ucxDataTransceiverTest")):
+        ucx_env = copy.copy(cpp_env)
+        ucx_env["UCX_MEMTYPE_CACHE"] = "n"
+        ucx_trans_test = [
+            "mpirun",
+            "-n",
+            "2",
+            "--allow-run-as-root",
+            "batch_manager/ucxDataTransceiverTest",
+        ]
+        run_command(ucx_trans_test, cwd=tests_dir, env=ucx_env, timeout=300)
+    else:
+        _logger.info(
+            "batch_manager/ucxDataTransceiverTest not found, so skipping.")
+
+
+def run_llama_executor_multi_gpu_tests(build_dir: _pl.Path, timeout=1500):
+    tests_dir = build_dir / "tests"
+    cpp_env = {**_os.environ}
 
     mgpu_env = copy.copy(cpp_env)
     mgpu_env["RUN_LLAMA_MULTI_GPU"] = "true"
@@ -316,19 +331,6 @@ def run_multi_gpu_tests(build_dir: _pl.Path, timeout=1500):
     ]
     run_command(trt_model_test, cwd=tests_dir, env=mgpu_env, timeout=1500)
 
-    #EncDec test in leader mode
-    xml_output_file = build_dir / "results-multi-gpu-t5-exec-leader-mode.xml"
-    trt_model_test = produce_mpirun_command(
-        global_commands=["mpirun", "--allow-run-as-root"],
-        nranks=4,
-        local_commands=[
-            "executor/encDecTest",
-            "--gtest_filter=T5MultiGPUTest/EncDecParamsTest.Forward*"
-        ],
-        leader_commands=[f"--gtest_output=xml:{xml_output_file}"],
-    )
-    run_command(trt_model_test, cwd=tests_dir, env=cpp_env, timeout=1500)
-
     #Logits processor and guided decoding test in leader mode
     xml_output_file = build_dir / "results-multi-gpu-logits-proc.xml"
     tp_pp_sizes = [(4, 1), (2, 2), (1, 4)]
@@ -350,24 +352,44 @@ def run_multi_gpu_tests(build_dir: _pl.Path, timeout=1500):
         leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
     run_command(trt_model_test, cwd=tests_dir, env=mgpu_env, timeout=1500)
 
-    # UCX transceiver tests, the test may not be built if ENABLE_UCX is 0
-    if _os.path.exists(
-            _os.path.join(tests_dir, "batch_manager/ucxDataTransceiverTest")):
-        ucx_env = copy.copy(cpp_env)
-        ucx_env["UCX_MEMTYPE_CACHE"] = "n"
-        ucx_trans_test = [
-            "mpirun",
-            "-n",
-            "2",
-            "--allow-run-as-root",
-            "batch_manager/ucxDataTransceiverTest",
-        ]
-        run_command(ucx_trans_test, cwd=tests_dir, env=ucx_env, timeout=300)
 
-    run_disagg_tests(build_dir)
+def run_t5_multi_gpu_tests(build_dir: _pl.Path, timeout=1500):
+    tests_dir = build_dir / "tests"
+    cpp_env = {**_os.environ}
 
+    #EncDec test in leader mode
+    xml_output_file = build_dir / "results-multi-gpu-t5-exec-leader-mode.xml"
+    trt_model_test = produce_mpirun_command(
+        global_commands=["mpirun", "--allow-run-as-root"],
+        nranks=4,
+        local_commands=[
+            "executor/encDecTest",
+            "--gtest_filter=T5MultiGPUTest/EncDecParamsTest.Forward*"
+        ],
+        leader_commands=[f"--gtest_output=xml:{xml_output_file}"],
+    )
+    run_command(trt_model_test, cwd=tests_dir, env=cpp_env, timeout=1500)
+
+
+def run_trt_gpt_model_real_decoder_multi_gpu_tests(build_dir: _pl.Path,
+                                                   timeout=1500):
+    tests_dir = build_dir / "tests"
+    cpp_env = {**_os.environ}
+
+    xml_output_file = build_dir / "results-multi-gpu-real-decoder.xml"
+    trt_model_test = produce_mpirun_command(
+        global_commands=["mpirun", "--allow-run-as-root"],
+        nranks=4,
+        local_commands=[
+            "batch_manager/trtGptModelRealDecoderTest",
+            "--gtest_filter=*TP*:*PP*"
+        ],
+        leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
+    run_command(trt_model_test, cwd=tests_dir, env=cpp_env,
+                timeout=timeout)  # expecting ~ 1200s
 
-def run_disagg_tests(build_dir: _pl.Path):
+
+def run_disagg_multi_gpu_tests(build_dir: _pl.Path):
 
     tests_dir = build_dir / "tests"
     cpp_env = {**_os.environ}
@@ -549,38 +571,28 @@ def prepare_model_tests(model_name: str,
                 timeout=600)
 
 
-def prepare_multi_gpu_model_tests(python_exe: str,
+def prepare_multi_gpu_model_tests(test_list: List[str],
+                                  python_exe: str,
                                   root_dir: _pl.Path,
                                   resources_dir: _pl.Path,
                                   model_cache: Optional[str] = None):
     model_cache_arg = ["--model_cache", model_cache] if model_cache else []
-    only_multi_gpu_arg = ["--only_multi_gpu"]
-
-    prepare_model_tests(model_name="llama",
-                        python_exe=python_exe,
-                        root_dir=root_dir,
-                        resources_dir=resources_dir,
-                        model_cache_arg=model_cache_arg,
-                        only_multi_gpu_arg=only_multi_gpu_arg)
-
-    prepare_model_tests(model_name="llama",
-                        python_exe=python_exe,
-                        root_dir=root_dir,
-                        resources_dir=resources_dir,
-                        model_cache_arg=model_cache_arg)
-
-    prepare_model_tests(model_name="t5",
-                        python_exe=python_exe,
-                        root_dir=root_dir,
-                        resources_dir=resources_dir,
-                        model_cache_arg=model_cache_arg,
-                        only_multi_gpu_arg=['--tp', '4', '--pp', '1'])
-
-    prepare_model_tests(model_name="gpt",
-                        python_exe=python_exe,
-                        root_dir=root_dir,
-                        resources_dir=resources_dir,
-                        model_cache_arg=model_cache_arg)
+
+    if "llama" in test_list:
+        prepare_model_tests(model_name="llama",
+                            python_exe=python_exe,
+                            root_dir=root_dir,
+                            resources_dir=resources_dir,
+                            model_cache_arg=model_cache_arg,
+                            only_multi_gpu_arg=["--only_multi_gpu"])
+
+    if "t5" in test_list:
+        prepare_model_tests(model_name="t5",
+                            python_exe=python_exe,
+                            root_dir=root_dir,
+                            resources_dir=resources_dir,
+                            model_cache_arg=model_cache_arg,
+                            only_multi_gpu_arg=['--tp', '4', '--pp', '1'])
 
 
 def run_single_gpu_tests(build_dir: _pl.Path,
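
The net effect of the cpp_common.py changes is that the old monolithic run_multi_gpu_tests entry point is split into independent runners (run_simple_multi_gpu_tests, run_llama_executor_multi_gpu_tests, run_t5_multi_gpu_tests, run_trt_gpt_model_real_decoder_multi_gpu_tests, run_disagg_multi_gpu_tests), and prepare_multi_gpu_model_tests now only builds the model families named in its new test_list argument. The sketch below shows how a driver could wire these pieces together. It is illustrative only: the actual callers live in the other two changed files of this commit, which are not shown in this excerpt, and the module path defs.cpp_common, the function run_all_multi_gpu_stages, and its arguments are assumptions, not part of the commit.

# Hypothetical driver sketch -- not taken from this commit's diff.
# Assumes tests/integration/defs is importable as the "defs" package and that
# build_dir, python_exe, root_dir, resources_dir and model_cache are supplied
# by the surrounding test harness.
from defs.cpp_common import (prepare_multi_gpu_model_tests,
                             run_simple_multi_gpu_tests,
                             run_llama_executor_multi_gpu_tests,
                             run_t5_multi_gpu_tests,
                             run_trt_gpt_model_real_decoder_multi_gpu_tests,
                             run_disagg_multi_gpu_tests)


def run_all_multi_gpu_stages(build_dir, python_exe, root_dir, resources_dir,
                             model_cache=None):
    # Build engines only for the model families this stage actually exercises.
    prepare_multi_gpu_model_tests(test_list=["llama", "t5"],
                                  python_exe=python_exe,
                                  root_dir=root_dir,
                                  resources_dir=resources_dir,
                                  model_cache=model_cache)

    # Each runner is now a separate entry point, so a CI scheduler can run
    # them in one job (as here) or split them across jobs.
    run_simple_multi_gpu_tests(build_dir)
    run_llama_executor_multi_gpu_tests(build_dir)
    run_t5_multi_gpu_tests(build_dir)
    run_trt_gpt_model_real_decoder_multi_gpu_tests(build_dir, timeout=1500)
    run_disagg_multi_gpu_tests(build_dir)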
