@@ -460,8 +460,7 @@ def test_bfloat16(self, mtp_nextn, attention_dp, cuda_graph,
             pytest.skip("https://nvbugs/5252313")
         if torch_compile and attention_dp:
             pytest.skip("https://nvbugs/5252559")
-        # OOM on H100 with default free_gpu_memory_fraction=0.9
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
         pytorch_config = dict(
             disable_overlap_scheduler=not overlap_scheduler,
             use_cuda_graph=cuda_graph,
@@ -507,8 +506,7 @@ def test_bfloat16_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
             pytest.skip("https://nvbugs/5252559")
         if torch_compile and pp_size > 1:
             pytest.skip("PP with torch.compile is not supported yet.")
-        # OOM on H100 with default free_gpu_memory_fraction=0.9
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
         pytorch_config = dict(
             disable_overlap_scheduler=not overlap_scheduler,
             use_cuda_graph=cuda_graph,
@@ -552,8 +550,7 @@ def test_fp8_block_scales(self, mtp_nextn, fp8kv, attention_dp, cuda_graph,
             pytest.skip("https://nvbugs/5252313")
         if torch_compile and attention_dp:
             pytest.skip("https://nvbugs/5252559")
-        # OOM on H100 with default free_gpu_memory_fraction=0.9
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
         pytorch_config = dict(
             disable_overlap_scheduler=not overlap_scheduler,
             use_cuda_graph=cuda_graph,
@@ -592,8 +589,7 @@ def test_fp8_block_scales(self, mtp_nextn, fp8kv, attention_dp, cuda_graph,
592589
593590 @pytest .mark .skip_device_not_contain (["H100" ])
594591 def test_fp8_block_scales_cuda_graph_padding (self ):
595- # OOM on H100 with default free_gpu_memory_fraction=0.9
596- kv_cache_config = KvCacheConfig (free_gpu_memory_fraction = 0.8 )
592+ kv_cache_config = KvCacheConfig (free_gpu_memory_fraction = 0.9 )
597593 pytorch_config = dict (
598594 disable_overlap_scheduler = False ,
599595 use_cuda_graph = True ,
@@ -638,8 +634,7 @@ def test_fp8_block_scales_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
             pytest.skip("https://nvbugs/5252559")
         if torch_compile and pp_size > 1:
             pytest.skip("PP with torch.compile is not supported yet.")
-        # OOM on H100 with default free_gpu_memory_fraction=0.9
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
         pytorch_config = dict(
             disable_overlap_scheduler=not overlap_scheduler,
             use_cuda_graph=cuda_graph,
@@ -682,8 +677,7 @@ def test_fp8_block_scales_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
     @pytest.mark.skip_less_device(4)
     @pytest.mark.skip_device_not_contain(["H100", "H200"])
     def test_fp8_block_scales_4gpus_static_eplb(self):
-        # OOM on H100 with default free_gpu_memory_fraction=0.9
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
 
         num_experts = 72
         num_slots = 80
@@ -840,7 +834,7 @@ def test_no_kv_cache_reuse(self, quant_dtype, mtp_nextn, fp8kv,
         elif quant_dtype == "nvfp4":
             model_path = f"{llm_models_root()}/DeepSeek-V3-Lite/nvfp4_moe_only"
 
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6,
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9,
                                         enable_block_reuse=False)
         pytorch_config = dict(
             disable_overlap_scheduler=not overlap_scheduler,
@@ -907,7 +901,7 @@ def test_nvfp4_8gpus(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv,
                           attention_dp, cuda_graph, overlap_scheduler,
                           max_batch_size, moe_backend):
 
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
         pytorch_config = dict(disable_overlap_scheduler=not overlap_scheduler,
                               use_cuda_graph=cuda_graph,
                               moe_backend=moe_backend)
@@ -956,7 +950,7 @@ def test_nvfp4_8gpus(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv,
     def test_fp8_blockscale(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv,
                             attention_dp, cuda_graph, overlap_scheduler,
                             max_batch_size):
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
         pytorch_config = dict(
             disable_overlap_scheduler=not overlap_scheduler,
             use_cuda_graph=cuda_graph,
@@ -1010,7 +1004,7 @@ class TestNemotronNas(LlmapiAccuracyTestHarness):
 
     @pytest.mark.skip_less_device(8)
     def test_auto_dtype_tp8(self):
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
         pytorch_config = dict()
 
         with LLM(self.MODEL_PATH,
@@ -1306,7 +1300,7 @@ def test_fp8(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
         pytorch_config = dict(disable_overlap_scheduler=not overlap_scheduler,
                               use_cuda_graph=cuda_graph)
 
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
         llm = LLM(
             f"{llm_models_root()}/Qwen3/saved_models_Qwen3-235B-A22B_fp8_hf",
             tensor_parallel_size=tp_size,
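
Note: every hunk in this commit makes the same change — the per-test free_gpu_memory_fraction overrides (0.4 through 0.8, originally added to avoid OOM on H100) are replaced with 0.9, which the removed comments identify as the KvCacheConfig default. A minimal sketch of the pattern these tests share, assuming the tensorrt_llm llmapi imports this test file already uses; the model path is a placeholder, not one of the real test checkpoints:

    from tensorrt_llm import LLM
    from tensorrt_llm.llmapi import KvCacheConfig

    # 0.9 matches the default free_gpu_memory_fraction noted in the removed
    # comments, so the tests now effectively run with the default KV-cache budget.
    kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)

    # Placeholder model path for illustration; the tests build theirs
    # from llm_models_root().
    llm = LLM("<model-path>", kv_cache_config=kv_cache_config)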