Commit fe1886f

[CI][chore] Print device memory information for each case
Signed-off-by: Hui Gao <[email protected]>
1 parent be48cdf commit fe1886f

3 files changed, +40 -28 lines changed

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 2 additions & 1 deletion
@@ -28,7 +28,7 @@
                          SamplingParams, TorchCompileConfig)
 from tensorrt_llm.quantization import QuantAlgo
 
-from ..conftest import (get_device_count, get_device_memory, llm_models_root,
+from ..conftest import (get_device_count, get_device_memory, print_device_memory, llm_models_root,
                         parametrize_with_ids, skip_no_hopper,
                         skip_post_blackwell, skip_pre_ada, skip_pre_blackwell,
                         skip_pre_hopper, skip_ray)
@@ -2186,6 +2186,7 @@ def test_nvfp4_multi_gpus(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv,
                               attention_dp, enable_lm_head_tp_in_adp,
                               cuda_graph, overlap_scheduler, max_batch_size,
                               moe_backend):
+        print_device_memory()
         if moe_backend == "TRTLLM" and (get_sm_version() == 120
                                         or get_sm_version() == 121):
             pytest.skip(
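In this commit the snapshot is taken by an explicit print_device_memory() call at the top of test_nvfp4_multi_gpus only. As a hedged sketch (not part of the commit), the same per-case report could be extended to a whole module with an autouse pytest fixture; the fixture name below is hypothetical, and the relative import mirrors the ..conftest path used above.

# Illustrative sketch only, not part of this commit: an autouse fixture that
# prints a device-memory snapshot before and after every test in the module.
import pytest

from ..conftest import print_device_memory  # helper added in conftest.py below


@pytest.fixture(autouse=True)
def device_memory_report():
    print_device_memory()  # snapshot before the test body runs
    yield
    print_device_memory()  # snapshot after the test finishes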

tests/integration/defs/conftest.py

Lines changed: 38 additions & 26 deletions
@@ -1992,40 +1992,52 @@ def get_device_count():
     return len(get_gpu_device_list())
 
 
-def get_device_memory():
-    "get gpu memory"
-    memory = 0
+def get_device_memory_str():
     with tempfile.TemporaryDirectory() as temp_dirname:
         suffix = ".exe" if is_windows() else ""
-        # TODO: Use NRSU because we can't assume nvidia-smi across all platforms.
         cmd = " ".join([
-            "nvidia-smi" + suffix, "--query-gpu=memory.total",
-            "--format=csv,noheader"
-        ])
-        # Try to get memory from nvidia-smi first, if failed, fallback to system memory from /proc/meminfo
-        # This fallback is needed for systems with unified memory (e.g. DGX Spark)
+            "nvidia-smi" + suffix, "--query-gpu=memory.total,memory.reserved,memory.used,memory.free",
+            "--format=csv,noheader"
+        ])
+        output = check_output(cmd, shell=True, cwd=temp_dirname)
+        return output.strip()
+
+
+def get_device_memory():
+    "get gpu memory"
+    memory = 0
+    # Try to get memory from nvidia-smi first, if failed, fallback to system memory from /proc/meminfo
+    # This fallback is needed for systems with unified memory (e.g. DGX Spark)
+    try:
+        output = get_device_memory_str()
+        memory_str = output.strip().split()[0]
+        # Check if nvidia-smi returned a valid numeric value
+        if "N/A" in memory_str:
+            raise ValueError("nvidia-smi returned invalid memory info")
+        memory = int(memory_str)
+    except (sp.CalledProcessError, ValueError, IndexError):
+        # Fallback to system memory from /proc/meminfo (in kB, convert to MiB)
         try:
-            output = check_output(cmd, shell=True, cwd=temp_dirname)
-            memory_str = output.strip().split()[0]
-            # Check if nvidia-smi returned a valid numeric value
-            if "N/A" in memory_str:
-                raise ValueError("nvidia-smi returned invalid memory info")
-            memory = int(memory_str)
-        except (sp.CalledProcessError, ValueError, IndexError):
-            # Fallback to system memory from /proc/meminfo (in kB, convert to MiB)
-            try:
-                with open("/proc/meminfo", "r") as f:
-                    for line in f:
-                        if line.startswith("MemTotal:"):
-                            memory = int(
-                                line.split()[1]) // 1024  # Convert kB to MiB
-                            break
-            except:
-                memory = 8192  # Default 8GB if all else fails
+            with open("/proc/meminfo", "r") as f:
+                for line in f:
+                    if line.startswith("MemTotal:"):
+                        memory = int(
+                            line.split()[1]) // 1024  # Convert kB to MiB
+                        break
+        except:
+            memory = 8192  # Default 8GB if all else fails
 
     return memory
 
 
+def print_device_memory():
+    memory_str = get_device_memory_str()
+    print(f"Device Memory:\ntotal: reserved: used: free: \n{memory_str}")
+    torch.cuda.empty_cache()
+    import gc
+    gc.collect()
+    memory_str = get_device_memory_str()
+    print(f"Device Memory:\ntotal: reserved: used: free: \n{memory_str}")
+
 def pytest_addoption(parser):
     parser.addoption(
         "--test-list",

tests/integration/test_lists/waives.txt

Lines changed: 0 additions & 1 deletion
@@ -408,7 +408,6 @@ triton_server/test_triton_llm.py::test_llmapi_backend[4-0-disableDecoupleMode-te
 accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=0-overlap_scheduler=False] SKIP (https://nvbugs/5701491)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5701425)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=2-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5701425)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp8] SKIP (https://nvbugs/5698897)
 unittest/_torch/modules/tests_lora_modules/test_lora_attention_pytorch_flow_vs_trt.py::TestLoraAttentionPytorchFlowVsTRT::test_lora_attention SKIP (https://nvbugs/5701421)
 unittest/llmapi/test_llm_pytorch.py::test_embedding_bias_with_torch_sampler_strategies SKIP (https://nvbugs/5702791)
 accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[no_cuda_graph_overlap-cutlass] SKIP (https://nvbugs/5702795)
