@@ -14,6 +14,7 @@
 # limitations under the License.
 import os
 import sys
+import time
 
 import pytest
 import torch
@@ -59,9 +60,9 @@ def patched_start_mpi_pool(self):
 from tensorrt_llm.quantization import QuantAlgo
 
 from ..conftest import (get_device_count, get_device_memory, llm_models_root,
-                        parametrize_with_ids, skip_no_hopper,
-                        skip_post_blackwell, skip_pre_ada, skip_pre_blackwell,
-                        skip_pre_hopper, skip_ray)
+                        parametrize_with_ids, print_device_memory,
+                        skip_no_hopper, skip_post_blackwell, skip_pre_ada,
+                        skip_pre_blackwell, skip_pre_hopper, skip_ray)
 from .accuracy_core import (GSM8K, MMLU, CnnDailymail, GPQADiamond,
                             JsonModeEval, LlmapiAccuracyTestHarness,
                             LongBenchV2)
@@ -533,7 +534,9 @@ class TestLlama3_2_1B(LlmapiAccuracyTestHarness):
     MODEL_PATH = f"{llm_models_root()}/llama-3.2-models/Llama-3.2-1B"
     EXAMPLE_FOLDER = "models/core/llama"
 
-    def test_auto_dtype(self):
-        with LLM(self.MODEL_PATH) as llm:
+    @pytest.mark.parametrize("pp_size", [2, 4], ids=["pp2", "pp4"])
+    def test_auto_dtype(self, pp_size):
+        print_device_memory()
+        with LLM(self.MODEL_PATH, pipeline_parallel_size=pp_size) as llm:
             task = CnnDailymail(self.MODEL_NAME)
             task.evaluate(llm)
@@ -1328,6 +1331,7 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
     @parametrize_with_ids("mtp_nextn", [0, 2])
     def test_bfloat16(self, mtp_nextn, attention_dp, cuda_graph,
                       overlap_scheduler, torch_compile, enable_chunked_prefill):
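+        # Baseline GPU-memory snapshot before the engine is built (leak triage).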
+        print_device_memory()
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
         torch_compile_config = TorchCompileConfig(
             enable_fullgraph=True,
@@ -1351,6 +1355,11 @@ def test_bfloat16(self, mtp_nextn, attention_dp, cuda_graph,
                  speculative_config=mtp_config) as llm:
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
+            print_device_memory()
+
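+        # Sleep, then re-read memory to see whether LLM teardown released it.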
+        time.sleep(60)
+        print("================= mem after 60 s")
+        print_device_memory()
 
     @pytest.mark.skip_less_device_memory(60000)
     def test_bfloat16_2_model_mtp(self):
@@ -1406,6 +1415,10 @@ def test_bfloat16_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
         mtp_config = None
         if mtp_nextn > 0:
             mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
+
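+        # Memory snapshot just before the 4-GPU engine spins up.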
+        # time.sleep(5)
+        print("================= mem before testing")
+        print_device_memory()
         with LLM(self.MODEL_PATH,
                  tensor_parallel_size=tp_size,
                  pipeline_parallel_size=pp_size,
@@ -1417,6 +1430,18 @@ def test_bfloat16_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
                  speculative_config=mtp_config) as llm:
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
+            print("================= mem after testing")
+            print_device_memory()
+
+        # time.sleep(5)
+        print("================= mem after testing, outside the with block")
+        print_device_memory()
+
+        print("++++++++++++++++++++++++++++++++++++++++\n\n\n")
+
+        # time.sleep(60)
+        # print("================= mem after 60 s")
+        # print_device_memory()
 
     @skip_pre_hopper
     @parametrize_with_ids("torch_compile", [False, True])
@@ -2263,6 +2288,13 @@ def test_nvfp4_multi_gpus(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv,
         if moe_backend == "TRTLLM" and sm_version in (120, 121):
             pytest.skip(f"{moe_backend} backend does not support SM 120 or 121")
 
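+        # Drop dangling Python references and return cached CUDA blocks to the
+        # driver so the snapshot below reflects memory this process still holds.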
+        import gc
+        gc.collect()
+        torch.cuda.empty_cache()
+
+        print("\n--- nvidia-smi before the test ---")
+        print_device_memory()
+
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.70)
         pytorch_config = dict(
             disable_overlap_scheduler=not overlap_scheduler,
@@ -2297,9 +2329,19 @@ def test_nvfp4_multi_gpus(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv,
             task = CnnDailymail(self.MODEL_NAME)
             task.evaluate(llm)
-            # Commented out because GPQA takes too long to run
-            # task = GPQADiamond(self.MODEL_NAME)
-            # task.evaluate(llm,
-            #               extra_evaluator_kwargs=dict(apply_chat_template=True))
+            task = GPQADiamond(self.MODEL_NAME)
+            task.evaluate(llm,
+                          extra_evaluator_kwargs=dict(apply_chat_template=True))
+            print("=================================== test finished")
+            print_device_memory()
+
+        gc.collect()
+        torch.cuda.empty_cache()
+
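+        # Give asynchronous executor teardown time to finish before the
+        # final memory reading.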
+        time.sleep(180)
+        print("\n--- nvidia-smi 180 s after the test ---")
+        print_device_memory()
 
     @skip_pre_blackwell
     @pytest.mark.parametrize(
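
Note: `print_device_memory` is imported from the shared `conftest` above, but its definition is outside this diff. A minimal sketch of what such a helper might look like, assuming it reads per-GPU usage via `torch.cuda` (the real conftest version may instead shell out to `nvidia-smi`, as the log labels suggest; names and output format here are illustrative):

    import torch

    def print_device_memory() -> None:
        # Print used/total memory plus torch allocator stats for every
        # visible CUDA device; a no-op message when no GPU is present.
        if not torch.cuda.is_available():
            print("No CUDA devices visible")
            return
        for idx in range(torch.cuda.device_count()):
            free, total = torch.cuda.mem_get_info(idx)
            allocated = torch.cuda.memory_allocated(idx)
            reserved = torch.cuda.memory_reserved(idx)
            print(f"GPU {idx}: used {(total - free) / 2**30:.2f} GiB of "
                  f"{total / 2**30:.2f} GiB "
                  f"(torch allocated {allocated / 2**30:.2f} GiB, "
                  f"reserved {reserved / 2**30:.2f} GiB)")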