
Commit 64fd723

[CI][chore] Print device memory information for each case
Signed-off-by: Hui Gao <[email protected]>
1 parent 14554ab commit 64fd723

9 files changed: +140 additions, −65 deletions


tensorrt_llm/_torch/pyexecutor/py_executor.py

Lines changed: 6 additions & 0 deletions

@@ -452,6 +452,12 @@ def shutdown(self):
         """
         Signals the server to shutdown.
         """
+        import traceback
+        traceback.print_stack()
+        import os
+        print(
+            f"====================== shutdown in executor is called pid: {os.getpid()}"
+        )
         self.executor_request_queue.enqueue_shutdown_request()
         self.shutdown_event.wait()
         self.worker_thread.join()

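For comparison, a minimal sketch (not part of this commit) of the same shutdown tracing gated behind an environment variable so the prints stay quiet unless explicitly enabled; TLLM_DEBUG_SHUTDOWN is an assumed name, not an existing flag:

    # Hypothetical sketch: only emit the shutdown trace when the (assumed)
    # TLLM_DEBUG_SHUTDOWN variable is set, instead of printing unconditionally.
    import os
    import traceback

    if os.environ.get("TLLM_DEBUG_SHUTDOWN") == "1":
        traceback.print_stack()
        print(f"shutdown in executor is called pid: {os.getpid()}")
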
tensorrt_llm/_torch/pyexecutor/py_executor_creator.py

Lines changed: 4 additions & 1 deletion

@@ -221,7 +221,10 @@ def create_py_executor(
     tokenizer: Optional[TokenizerBase] = None,
     profiling_stage_data: Optional[dict] = None,
 ) -> PyExecutor:
-
+    # import os
+    # print(f"====================== create_py_executor pid: {os.getpid()}")
+    # import traceback
+    # print(f"====================== backtrace: {traceback.print_stack()}")
     garbage_collection_gen0_threshold = llm_args.garbage_collection_gen0_threshold
     lora_config = llm_args.lora_config
     kv_connector_config = llm_args.kv_connector_config

tensorrt_llm/commands/serve.py

Lines changed: 3 additions & 0 deletions

@@ -44,6 +44,9 @@
 
 def _signal_handler_cleanup_child(signum, frame):
     """Signal handler to clean up the child process."""
+    print(
+        f"================================================ server received signal {signal.Signals(signum).name}"
+    )
     global _child_p_global
     if _child_p_global and _child_p_global.poll() is None:
         # Using print for safety in signal handlers

tensorrt_llm/executor/rpc_proxy.py

Lines changed: 9 additions & 0 deletions

@@ -187,8 +187,17 @@ def abort_request(self, request_id: int) -> None:
         return self.rpc_client.abort_request(request_id).remote()
 
     def shutdown(self):
+        import traceback
+        traceback.print_stack()
+        import os
+        print(
+            f"====================== shutdown in generator is called pid: {os.getpid()}"
+        )
         if self._shutdown_event.is_set():
             return
+        print(
+            f"====================== shutdown in generator 2 is called pid: {os.getpid()}"
+        )
         self._shutdown_event.set()
         logger_debug(f"Shutting down GenerationExecutorRpcProxy",
                      color="yellow")

tensorrt_llm/llmapi/llm.py

Lines changed: 16 additions & 0 deletions

@@ -172,6 +172,10 @@ def __init__(self,
                     f"{self.__class__.__name__} got invalid argument: {key}"
                 )
 
+        import os
+        print(
+            f"====================== llm class is: {llm_args_cls} pid: {os.getpid()}"
+        )
         self.args = llm_args_cls.from_kwargs(
             model=model,
             tokenizer=tokenizer,
@@ -814,6 +818,10 @@ def _try_load_hf_model_config(
 
     @set_api_status("beta")
     def shutdown(self) -> None:
+        import traceback
+        traceback.print_stack()
+        import os
+        print(f"====================== shutdown is called pid: {os.getpid()}")
         if hasattr(self, "_executor") and self._executor is not None:
             self._executor.shutdown()
             self._executor = None
@@ -837,6 +845,10 @@ def _check_health(self) -> bool:
     def _shutdown_wrapper(self_ref):
         # Retrieve the instance if it still exists
         instance = self_ref()
+        import traceback
+        traceback.print_stack()
+        import os
+        print(f"====================== shutdown is called pid: {os.getpid()}")
         if instance is not None:
             instance.shutdown()
 
@@ -848,6 +860,10 @@ def __exit__(
     ) -> Literal[
             False]:  # https://github.com/microsoft/pyright/issues/7009#issuecomment-1894135045
         del exc_value, traceback
+        import traceback
+        traceback.print_stack()
+        import os
+        print(f"====================== shutdown is called pid: {os.getpid()}")
         self.shutdown()
         return False  # propagate exceptions
 

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 49 additions & 7 deletions

@@ -14,6 +14,7 @@
 # limitations under the License.
 import os
 import sys
+import time
 
 import pytest
 import torch
@@ -59,9 +60,9 @@ def patched_start_mpi_pool(self):
 from tensorrt_llm.quantization import QuantAlgo
 
 from ..conftest import (get_device_count, get_device_memory, llm_models_root,
-                        parametrize_with_ids, skip_no_hopper,
-                        skip_post_blackwell, skip_pre_ada, skip_pre_blackwell,
-                        skip_pre_hopper, skip_ray)
+                        parametrize_with_ids, print_device_memory,
+                        skip_no_hopper, skip_post_blackwell, skip_pre_ada,
+                        skip_pre_blackwell, skip_pre_hopper, skip_ray)
 from .accuracy_core import (GSM8K, MMLU, CnnDailymail, GPQADiamond,
                             JsonModeEval, LlmapiAccuracyTestHarness,
                             LongBenchV2)
@@ -533,7 +534,9 @@ class TestLlama3_2_1B(LlmapiAccuracyTestHarness):
     MODEL_PATH = f"{llm_models_root()}/llama-3.2-models/Llama-3.2-1B"
     EXAMPLE_FOLDER = "models/core/llama"
 
-    def test_auto_dtype(self):
+    @pytest.mark.parametrize("pp_size", [2, 4], ids=["pp2", "pp4"])
+    def test_auto_dtype(self, pp_size):
+        print_device_memory()
         with LLM(self.MODEL_PATH) as llm:
             task = CnnDailymail(self.MODEL_NAME)
             task.evaluate(llm)
@@ -1328,6 +1331,7 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
     @parametrize_with_ids("mtp_nextn", [0, 2])
     def test_bfloat16(self, mtp_nextn, attention_dp, cuda_graph,
                       overlap_scheduler, torch_compile, enable_chunked_prefill):
+        print_device_memory()
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
         torch_compile_config = TorchCompileConfig(
             enable_fullgraph=True,
@@ -1351,6 +1355,11 @@ def test_bfloat16(self, mtp_nextn, attention_dp, cuda_graph,
                  speculative_config=mtp_config) as llm:
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
+            print_device_memory()
+
+        time.sleep(60)
+        print(f"================= print mem after 60s")
+        print_device_memory()
 
     @pytest.mark.skip_less_device_memory(60000)
     def test_bfloat16_2_model_mtp(self):
@@ -1406,6 +1415,10 @@ def test_bfloat16_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
         mtp_config = None
         if mtp_nextn > 0:
             mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
+
+        #time.sleep(5)
+        print(f"================= print mem before testing")
+        print_device_memory()
         with LLM(self.MODEL_PATH,
                  tensor_parallel_size=tp_size,
                  pipeline_parallel_size=pp_size,
@@ -1417,6 +1430,18 @@ def test_bfloat16_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
                  speculative_config=mtp_config) as llm:
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
+            print(f"================= print mem after testing")
+            print_device_memory()
+
+        #time.sleep(5)
+        print(f"================= print mem after testing outside")
+        print_device_memory()
+
+        print(f"++++++++++++++++++++++++++++++++++++++++\n\n\n")
+
+        #time.sleep(60)
+        #print(f"================= print mem after 60s")
+        #print_device_memory()
 
     @skip_pre_hopper
     @parametrize_with_ids("torch_compile", [False, True])
@@ -2263,6 +2288,13 @@ def test_nvfp4_multi_gpus(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv,
         if moe_backend == "TRTLLM" and sm_version in (120, 121):
             pytest.skip(f"{moe_backend} backend does not support SM 120 or 121")
 
+        import gc
+        gc.collect()
+        torch.cuda.empty_cache()
+
+        print(f"\n--- nvidia-smi start to test ---")
+        print_device_memory()
+
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.70)
         pytorch_config = dict(
             disable_overlap_scheduler=not overlap_scheduler,
@@ -2297,9 +2329,19 @@ def test_nvfp4_multi_gpus(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv,
             task = CnnDailymail(self.MODEL_NAME)
             task.evaluate(llm)
             # Commented out because GPQA takes too long to run
-            # task = GPQADiamond(self.MODEL_NAME)
-            # task.evaluate(llm,
-            #               extra_evaluator_kwargs=dict(apply_chat_template=True))
+            task = GPQADiamond(self.MODEL_NAME)
+            task.evaluate(llm,
+                          extra_evaluator_kwargs=dict(apply_chat_template=True))
+            print("=================================== test finishes")
+            print_device_memory()
+
+            import gc
+            gc.collect()
+            torch.cuda.empty_cache()
+
+            time.sleep(180)
+            print(f"\n--- nvidia-smi after testing after 180s ---")
+            print_device_memory()
 
     @skip_pre_blackwell
     @pytest.mark.parametrize(

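For context, a minimal sketch (hypothetical, not part of this commit) of measuring the device-memory delta around one evaluation with torch.cuda.mem_get_info, the same call the print_device_memory() helper below relies on:

    import torch

    # mem_get_info() returns (free_bytes, total_bytes) for the current device.
    free_before, total = torch.cuda.mem_get_info()
    # ... run the test body here, e.g. task.evaluate(llm) ...
    free_after, _ = torch.cuda.mem_get_info()
    delta_mib = (free_before - free_after) / 2**20
    print(f"device memory delta: {delta_mib:.1f} MiB of {total / 2**20:.1f} MiB total")
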
tests/integration/defs/conftest.py

Lines changed: 49 additions & 24 deletions

@@ -2004,40 +2004,65 @@ def get_device_count():
     return len(get_gpu_device_list())
 
 
-def get_device_memory():
-    "get gpu memory"
-    memory = 0
+def get_device_memory_str():
     with tempfile.TemporaryDirectory() as temp_dirname:
         suffix = ".exe" if is_windows() else ""
-        # TODO: Use NRSU because we can't assume nvidia-smi across all platforms.
         cmd = " ".join([
-            "nvidia-smi" + suffix, "--query-gpu=memory.total",
+            "nvidia-smi" + suffix,
+            "--query-gpu=memory.total,memory.reserved,memory.used,memory.free",
            "--format=csv,noheader"
         ])
-        # Try to get memory from nvidia-smi first, if failed, fallback to system memory from /proc/meminfo
-        # This fallback is needed for systems with unified memory (e.g. DGX Spark)
+        output = check_output(cmd, shell=True, cwd=temp_dirname)
+        return output.strip()
+
+
+def get_device_memory():
+    "get gpu memory"
+    memory = 0
+    # Try to get memory from nvidia-smi first; if that fails, fall back to system memory from /proc/meminfo.
+    # This fallback is needed for systems with unified memory (e.g. DGX Spark).
+    try:
+        output = get_device_memory_str()
+        memory_str = output.strip().split()[0]
+        # Check if nvidia-smi returned a valid numeric value
+        if "N/A" in memory_str:
+            raise ValueError("nvidia-smi returned invalid memory info")
+        memory = int(memory_str)
+    except (sp.CalledProcessError, ValueError, IndexError):
+        # Fallback to system memory from /proc/meminfo (in kB, convert to MiB)
         try:
-            output = check_output(cmd, shell=True, cwd=temp_dirname)
-            memory_str = output.strip().split()[0]
-            # Check if nvidia-smi returned a valid numeric value
-            if "N/A" in memory_str:
-                raise ValueError("nvidia-smi returned invalid memory info")
-            memory = int(memory_str)
-        except (sp.CalledProcessError, ValueError, IndexError):
-            # Fallback to system memory from /proc/meminfo (in kB, convert to MiB)
-            try:
-                with open("/proc/meminfo", "r") as f:
-                    for line in f:
-                        if line.startswith("MemTotal:"):
-                            memory = int(
-                                line.split()[1]) // 1024  # Convert kB to MiB
-                            break
-            except:
-                memory = 8192  # Default 8GB if all else fails
+            with open("/proc/meminfo", "r") as f:
+                for line in f:
+                    if line.startswith("MemTotal:"):
+                        memory = int(
+                            line.split()[1]) // 1024  # Convert kB to MiB
+                        break
+        except:
+            memory = 8192  # Default 8GB if all else fails
 
     return memory
 
 
+def print_device_memory():
+    memory_str = get_device_memory_str()
+    print(f"Device Memory:\ntotal: reserved: used: free: \n{memory_str}")
+
+    mem_stats = torch.cuda.memory_stats()
+    torch_allocated_bytes = mem_stats["allocated_bytes.all.current"]
+    torch_reserved_bytes = mem_stats["reserved_bytes.all.current"]
+    print(
+        f"================================== torch mem stats: allocated {torch_allocated_bytes} reserved {torch_reserved_bytes}"
+    )
+    print(f"\n--- nvidia-smi in print_device_memory ---")
+    sp.run(["nvidia-smi"], check=False)
+
+    free_bytes, total_gpu_memory = torch.cuda.mem_get_info()
+    total_used_bytes = total_gpu_memory - free_bytes
+    print(
+        f"================================== torch mem info: free {free_bytes}, total {total_gpu_memory}, used {total_used_bytes}"
+    )
+
+
 def pytest_addoption(parser):
     parser.addoption(
         "--test-list",

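As a usage illustration only, a minimal sketch of printing device memory around every case via an autouse pytest fixture; it assumes the print_device_memory() helper added above, and the fixture itself is hypothetical rather than part of this commit (which calls the helper explicitly inside each test):

    import pytest

    @pytest.fixture(autouse=True)
    def _report_device_memory():
        # Hypothetical fixture: snapshot device memory before and after each
        # test case instead of calling print_device_memory() in every test body.
        print_device_memory()
        yield
        print_device_memory()
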
tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml

Lines changed: 4 additions & 4 deletions

@@ -32,10 +32,10 @@ l0_gb200_multi_nodes:
       backend: pytorch
   tests:
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency] TIMEOUT (180)
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp8] TIMEOUT (180)
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen] TIMEOUT (180)
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_adp_lmtp] TIMEOUT (180)
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen_adp_lmtp] TIMEOUT (180)
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp8] TIMEOUT (180) ISOLATION
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen] TIMEOUT (180) ISOLATION
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_adp_lmtp] TIMEOUT (180) ISOLATION
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen_adp_lmtp] TIMEOUT (180) ISOLATION
   - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass] TIMEOUT (90)
   - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] TIMEOUT (90)
   - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_attention_dp] TIMEOUT (90)
