
Commit 154f7cc

fix a bug of global cuda graph dummy request (NVIDIA#4894)
Signed-off-by: QI JUN <[email protected]>
1 parent 7e921c7 commit 154f7cc

File tree

4 files changed: +39 -0 lines changed

  examples/pytorch/quickstart_advanced.py
  tensorrt_llm/_torch/pyexecutor/model_engine.py
  tests/integration/defs/test_e2e.py
  tests/integration/test_lists/test-db/l0_dgx_h100.yml

examples/pytorch/quickstart_advanced.py

Lines changed: 9 additions & 0 deletions

@@ -78,6 +78,13 @@ def add_llm_args(parser):
                         default=False,
                         action='store_true')
     parser.add_argument('--use_cuda_graph', default=False, action='store_true')
+    parser.add_argument('--cuda_graph_padding_enabled',
+                        default=False,
+                        action='store_true')
+    parser.add_argument('--cuda_graph_batch_sizes',
+                        nargs='+',
+                        type=int,
+                        default=[])
     parser.add_argument('--print_iter_log',
                         default=False,
                         action='store_true',
@@ -160,6 +167,8 @@ def setup_llm(args):
         kv_cache_config=kv_cache_config,
         attn_backend=args.attention_backend,
         use_cuda_graph=args.use_cuda_graph,
+        cuda_graph_padding_enabled=args.cuda_graph_padding_enabled,
+        cuda_graph_batch_sizes=args.cuda_graph_batch_sizes,
         load_format=args.load_format,
         print_iter_log=args.print_iter_log,
         enable_iter_perf_stats=args.print_iter_log,
tensorrt_llm/_torch/pyexecutor/model_engine.py

Lines changed: 4 additions & 0 deletions

@@ -478,6 +478,10 @@ def warmup(self, resource_manager: ResourceManager) -> None:
             logger.info("Skipping warm up as no KV Cache manager allocated.")
             return
 
+        # The lifetime of model engine and kv cache manager can be different.
+        # Reset the global cuda graph dummy request to None in warmup.
+        self.cuda_graph_dummy_request = None
+
         def get_cuda_graph_warmup_request(batch_size):
             available_blocks = kv_cache_manager.get_num_free_blocks()
             if available_blocks >= batch_size:
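The comment in the diff captures the bug: the engine object can outlive the KV cache manager it warmed up against, so a dummy request cached from an earlier warmup would still reference blocks owned by a manager that has since been torn down. A minimal sketch of that failure mode, using hypothetical stand-in classes rather than the real TensorRT-LLM types:

# Hypothetical stand-ins, only to illustrate why warmup() now clears the cache.
class KVCacheManager:
    def allocate_dummy_request(self):
        return {"owner": self}  # the dummy request's blocks belong to this manager

class ModelEngine:
    def __init__(self):
        self.cuda_graph_dummy_request = None

    def warmup(self, kv_cache_manager):
        # Fix from this commit: drop any dummy request captured against a
        # previous (possibly already destroyed) KV cache manager.
        self.cuda_graph_dummy_request = None
        if self.cuda_graph_dummy_request is None:
            self.cuda_graph_dummy_request = kv_cache_manager.allocate_dummy_request()

engine = ModelEngine()
engine.warmup(KVCacheManager())  # first manager's lifetime
engine.warmup(KVCacheManager())  # new manager; without the reset, the stale
                                 # request from the first warmup would be reused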

tests/integration/defs/test_e2e.py

Lines changed: 25 additions & 0 deletions

@@ -1569,6 +1569,31 @@ def test_ptq_quickstart_advanced_mtp(llm_root, llm_venv, model_name,
         _check_mem_usage(running_log, [54.50, 0, 0, 0])
 
 
+@pytest.mark.skip_less_device(4)
+def test_ptq_quickstart_advanced_bs1(llm_root, llm_venv):
+    model_name = "DeepSeek-V3-Lite-FP8"
+    model_path = "DeepSeek-V3-Lite/fp8"
+    print(f"Testing {model_name}.")
+    example_root = Path(os.path.join(llm_root, "examples", "pytorch"))
+    llm_venv.run_cmd([
+        str(example_root / "quickstart_advanced.py"),
+        "--use_cuda_graph",
+        "--cuda_graph_padding_enabled",
+        "--cuda_graph_batch_sizes",
+        "8",
+        "--disable_overlap_scheduler",
+        "--enable_attention_dp",
+        "--tp_size",
+        "4",
+        "--moe_ep_size",
+        "4",
+        "--prompt",
+        "\"NVIDIA is a great company because\"",
+        "--model_dir",
+        f"{llm_models_root()}/{model_path}",
+    ])
+
+
 @pytest.mark.skip_less_device_memory(80000)
 @pytest.mark.skip_less_device(8)
 @skip_pre_hopper
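Presumably the new test can also be run on its own on a 4-GPU node by selecting its pytest node id, tests/integration/defs/test_e2e.py::test_ptq_quickstart_advanced_bs1; in CI it is picked up through the l0_dgx_h100 test list updated below.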

tests/integration/test_lists/test-db/l0_dgx_h100.yml

Lines changed: 1 addition & 0 deletions

@@ -53,6 +53,7 @@ l0_dgx_h100:
     auto_trigger: deepseek
   tests:
   - unittest/_torch/multi_gpu_modeling -k "deepseek"
+  - test_e2e.py::test_ptq_quickstart_advanced_bs1
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=True-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
