
Commit ec50684

Revert "fix a bug of global cuda graph dummy request" (NVIDIA#4970)
1 parent 37ac564 commit ec50684

File tree

4 files changed: +0 −39 lines changed

examples/pytorch/quickstart_advanced.py
tensorrt_llm/_torch/pyexecutor/model_engine.py
tests/integration/defs/test_e2e.py
tests/integration/test_lists/test-db/l0_dgx_h100.yml

examples/pytorch/quickstart_advanced.py

Lines changed: 0 additions & 9 deletions
@@ -78,13 +78,6 @@ def add_llm_args(parser):
                         default=False,
                         action='store_true')
     parser.add_argument('--use_cuda_graph', default=False, action='store_true')
-    parser.add_argument('--cuda_graph_padding_enabled',
-                        default=False,
-                        action='store_true')
-    parser.add_argument('--cuda_graph_batch_sizes',
-                        nargs='+',
-                        type=int,
-                        default=[])
     parser.add_argument('--print_iter_log',
                         default=False,
                         action='store_true',
@@ -167,8 +160,6 @@ def setup_llm(args):
         kv_cache_config=kv_cache_config,
         attn_backend=args.attention_backend,
         use_cuda_graph=args.use_cuda_graph,
-        cuda_graph_padding_enabled=args.cuda_graph_padding_enabled,
-        cuda_graph_batch_sizes=args.cuda_graph_batch_sizes,
         load_format=args.load_format,
         print_iter_log=args.print_iter_log,
         enable_iter_perf_stats=args.print_iter_log,
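
As an aside, a minimal standalone argparse demo of what the two deleted flags accepted, reconstructed from the deleted lines above only (a sketch; this code no longer exists in the script after this revert):

import argparse

# Rebuild just the two removed options to show their parsing behavior.
parser = argparse.ArgumentParser()
parser.add_argument('--cuda_graph_padding_enabled',
                    default=False,
                    action='store_true')
parser.add_argument('--cuda_graph_batch_sizes',
                    nargs='+',   # one or more values after the flag
                    type=int,
                    default=[])

args = parser.parse_args(
    ['--cuda_graph_padding_enabled', '--cuda_graph_batch_sizes', '1', '2', '4', '8'])
print(args.cuda_graph_padding_enabled)  # True
print(args.cuda_graph_batch_sizes)      # [1, 2, 4, 8]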

tensorrt_llm/_torch/pyexecutor/model_engine.py

Lines changed: 0 additions & 4 deletions
@@ -478,10 +478,6 @@ def warmup(self, resource_manager: ResourceManager) -> None:
             logger.info("Skipping warm up as no KV Cache manager allocated.")
             return

-        # The lifetime of model engine and kv cache manager can be different.
-        # Reset the global cuda graph dummy request to None in warmup.
-        self.cuda_graph_dummy_request = None
-
         def get_cuda_graph_warmup_request(batch_size):
             available_blocks = kv_cache_manager.get_num_free_blocks()
             if available_blocks >= batch_size:
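
The deleted lines guarded against reusing a cached dummy request when the model engine outlives its KV cache manager. A minimal, self-contained sketch of that hazard, with all names hypothetical except cuda_graph_dummy_request:

class KVCacheManager:
    """Hypothetical stand-in for the real KV cache manager."""

class DummyRequest:
    """Hypothetical stand-in for the cached CUDA graph dummy request."""
    def __init__(self, manager):
        self.manager = manager  # its blocks live in this manager

class Engine:
    def __init__(self):
        self.cuda_graph_dummy_request = None  # persists across warmups

    def warmup(self, manager, reset_fix=True):
        if reset_fix:
            # The reverted fix: drop the cached request so a fresh one is
            # built against the current KV cache manager.
            self.cuda_graph_dummy_request = None
        if self.cuda_graph_dummy_request is None:
            self.cuda_graph_dummy_request = DummyRequest(manager)
        # Without the reset, a later warmup with a new manager would reuse
        # a request whose blocks belong to the old, already-freed manager.
        assert self.cuda_graph_dummy_request.manager is manager

engine = Engine()
engine.warmup(KVCacheManager())                    # first lifetime: fresh request
engine.warmup(KVCacheManager(), reset_fix=True)    # ok: request rebuilt
# engine.warmup(KVCacheManager(), reset_fix=False) # would trip the assert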

tests/integration/defs/test_e2e.py

Lines changed: 0 additions & 25 deletions
@@ -1569,31 +1569,6 @@ def test_ptq_quickstart_advanced_mtp(llm_root, llm_venv, model_name,
         _check_mem_usage(running_log, [54.50, 0, 0, 0])


-@pytest.mark.skip_less_device(4)
-def test_ptq_quickstart_advanced_bs1(llm_root, llm_venv):
-    model_name = "DeepSeek-V3-Lite-FP8"
-    model_path = "DeepSeek-V3-Lite/fp8"
-    print(f"Testing {model_name}.")
-    example_root = Path(os.path.join(llm_root, "examples", "pytorch"))
-    llm_venv.run_cmd([
-        str(example_root / "quickstart_advanced.py"),
-        "--use_cuda_graph",
-        "--cuda_graph_padding_enabled",
-        "--cuda_graph_batch_sizes",
-        "8",
-        "--disable_overlap_scheduler",
-        "--enable_attention_dp",
-        "--tp_size",
-        "4",
-        "--moe_ep_size",
-        "4",
-        "--prompt",
-        "\"NVIDIA is a great company because\"",
-        "--model_dir",
-        f"{llm_models_root()}/{model_path}",
-    ])
-
-
 @pytest.mark.skip_less_device_memory(80000)
 @pytest.mark.skip_less_device(8)
 @skip_pre_hopper
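
Outside the test harness, the removed test amounted to the invocation below (a sketch; the model path is a placeholder, and the two flagged options no longer exist after this revert):

import subprocess

subprocess.run([
    "python", "examples/pytorch/quickstart_advanced.py",
    "--use_cuda_graph",
    "--cuda_graph_padding_enabled",   # flag removed by this revert
    "--cuda_graph_batch_sizes", "8",  # flag removed by this revert
    "--disable_overlap_scheduler",
    "--enable_attention_dp",
    "--tp_size", "4",
    "--moe_ep_size", "4",
    "--prompt", "NVIDIA is a great company because",
    "--model_dir", "/path/to/DeepSeek-V3-Lite/fp8",  # placeholder path
], check=True)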

tests/integration/test_lists/test-db/l0_dgx_h100.yml

Lines changed: 0 additions & 1 deletion
@@ -53,7 +53,6 @@ l0_dgx_h100:
       auto_trigger: deepseek
       tests:
       - unittest/_torch/multi_gpu_modeling -k "deepseek"
-      - test_e2e.py::test_ptq_quickstart_advanced_bs1
       - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
       - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True]
       - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=True-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
