
Commit a152635

[TRTLLM-5630] restore free_gpu_memory_fraction=0.9 in tests (NVIDIA#4859)
Signed-off-by: ixlmar <[email protected]>
1 parent b8c5e38 commit a152635


tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 11 additions & 17 deletions
@@ -460,8 +460,7 @@ def test_bfloat16(self, mtp_nextn, attention_dp, cuda_graph,
             pytest.skip("https://nvbugs/5252313")
         if torch_compile and attention_dp:
             pytest.skip("https://nvbugs/5252559")
-        # OOM on H100 with default free_gpu_memory_fraction=0.9
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
         pytorch_config = dict(
             disable_overlap_scheduler=not overlap_scheduler,
             use_cuda_graph=cuda_graph,
@@ -507,8 +506,7 @@ def test_bfloat16_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
             pytest.skip("https://nvbugs/5252559")
         if torch_compile and pp_size > 1:
             pytest.skip("PP with torch.compile is not supported yet.")
-        # OOM on H100 with default free_gpu_memory_fraction=0.9
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
         pytorch_config = dict(
             disable_overlap_scheduler=not overlap_scheduler,
             use_cuda_graph=cuda_graph,
@@ -552,8 +550,7 @@ def test_fp8_block_scales(self, mtp_nextn, fp8kv, attention_dp, cuda_graph,
             pytest.skip("https://nvbugs/5252313")
         if torch_compile and attention_dp:
             pytest.skip("https://nvbugs/5252559")
-        # OOM on H100 with default free_gpu_memory_fraction=0.9
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
         pytorch_config = dict(
             disable_overlap_scheduler=not overlap_scheduler,
             use_cuda_graph=cuda_graph,
@@ -592,8 +589,7 @@ def test_fp8_block_scales(self, mtp_nextn, fp8kv, attention_dp, cuda_graph,

     @pytest.mark.skip_device_not_contain(["H100"])
     def test_fp8_block_scales_cuda_graph_padding(self):
-        # OOM on H100 with default free_gpu_memory_fraction=0.9
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
         pytorch_config = dict(
             disable_overlap_scheduler=False,
             use_cuda_graph=True,
@@ -638,8 +634,7 @@ def test_fp8_block_scales_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
             pytest.skip("https://nvbugs/5252559")
         if torch_compile and pp_size > 1:
             pytest.skip("PP with torch.compile is not supported yet.")
-        # OOM on H100 with default free_gpu_memory_fraction=0.9
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
         pytorch_config = dict(
             disable_overlap_scheduler=not overlap_scheduler,
             use_cuda_graph=cuda_graph,
@@ -682,8 +677,7 @@ def test_fp8_block_scales_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
     @pytest.mark.skip_less_device(4)
     @pytest.mark.skip_device_not_contain(["H100", "H200"])
     def test_fp8_block_scales_4gpus_static_eplb(self):
-        # OOM on H100 with default free_gpu_memory_fraction=0.9
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)

         num_experts = 72
         num_slots = 80
@@ -840,7 +834,7 @@ def test_no_kv_cache_reuse(self, quant_dtype, mtp_nextn, fp8kv,
         elif quant_dtype == "nvfp4":
             model_path = f"{llm_models_root()}/DeepSeek-V3-Lite/nvfp4_moe_only"

-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6,
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9,
                                         enable_block_reuse=False)
         pytorch_config = dict(
             disable_overlap_scheduler=not overlap_scheduler,
@@ -907,7 +901,7 @@ def test_nvfp4_8gpus(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv,
                          attention_dp, cuda_graph, overlap_scheduler,
                          max_batch_size, moe_backend):

-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
         pytorch_config = dict(disable_overlap_scheduler=not overlap_scheduler,
                               use_cuda_graph=cuda_graph,
                               moe_backend=moe_backend)
@@ -956,7 +950,7 @@ def test_nvfp4_8gpus(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv,
     def test_fp8_blockscale(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv,
                             attention_dp, cuda_graph, overlap_scheduler,
                             max_batch_size):
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
         pytorch_config = dict(
             disable_overlap_scheduler=not overlap_scheduler,
             use_cuda_graph=cuda_graph,
@@ -1010,7 +1004,7 @@ class TestNemotronNas(LlmapiAccuracyTestHarness):

     @pytest.mark.skip_less_device(8)
     def test_auto_dtype_tp8(self):
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
         pytorch_config = dict()

         with LLM(self.MODEL_PATH,
@@ -1306,7 +1300,7 @@ def test_fp8(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
         pytorch_config = dict(disable_overlap_scheduler=not overlap_scheduler,
                               use_cuda_graph=cuda_graph)

-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
         llm = LLM(
             f"{llm_models_root()}/Qwen3/saved_models_Qwen3-235B-A22B_fp8_hf",
             tensor_parallel_size=tp_size,
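
Every hunk above makes the same one-line change: drop the OOM workaround comment and let KvCacheConfig use free_gpu_memory_fraction=0.9, the default value named in the removed comments, instead of the lowered 0.4-0.8 values. For readers unfamiliar with the knob, below is a minimal sketch of how such a config is handed to the TensorRT-LLM LLM API; the model path and the generate() call are illustrative assumptions, not taken from this diff.

# Minimal sketch (not part of the commit) of the pattern these tests use:
# build a KvCacheConfig with the restored default fraction and pass it to
# the LLM constructor.
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import KvCacheConfig

# Reserve 90% of the free GPU memory for the KV cache (the default that
# this commit restores in the tests).
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)

llm = LLM("path/to/DeepSeek-V3-Lite",  # placeholder model path
          kv_cache_config=kv_cache_config)

# Simple smoke check that the engine comes up with the restored setting.
for output in llm.generate(["The capital of France is"]):
    print(output.outputs[0].text)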
