Commit fe1886f

[CI][chore] Print device memory information for each case
Signed-off-by: Hui Gao <[email protected]>
1 parent be48cdf commit fe1886f

3 files changed, +40 -28 lines changed

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 2 additions & 1 deletion
@@ -28,7 +28,7 @@
                          SamplingParams, TorchCompileConfig)
 from tensorrt_llm.quantization import QuantAlgo
 
-from ..conftest import (get_device_count, get_device_memory, llm_models_root,
+from ..conftest import (get_device_count, get_device_memory, print_device_memory, llm_models_root,
                         parametrize_with_ids, skip_no_hopper,
                         skip_post_blackwell, skip_pre_ada, skip_pre_blackwell,
                         skip_pre_hopper, skip_ray)
@@ -2186,6 +2186,7 @@ def test_nvfp4_multi_gpus(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv,
                               attention_dp, enable_lm_head_tp_in_adp,
                               cuda_graph, overlap_scheduler, max_batch_size,
                               moe_backend):
+        print_device_memory()
         if moe_backend == "TRTLLM" and (get_sm_version() == 120
                                         or get_sm_version() == 121):
             pytest.skip(
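In this commit the snapshot is taken by an explicit print_device_memory() call at the top of test_nvfp4_multi_gpus only. As a hedged sketch (not part of the commit), the same per-case report could be extended to a whole module with an autouse pytest fixture; the fixture name below is hypothetical, and the relative import mirrors the ..conftest path used above.

# Illustrative sketch only, not part of this commit: an autouse fixture that
# prints a device-memory snapshot before and after every test in the module.
import pytest

from ..conftest import print_device_memory  # helper added in conftest.py below


@pytest.fixture(autouse=True)
def device_memory_report():
    print_device_memory()  # snapshot before the test body runs
    yield
    print_device_memory()  # snapshot after the test finishes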

tests/integration/defs/conftest.py

Lines changed: 38 additions & 26 deletions
@@ -1992,40 +1992,52 @@ def get_device_count():
     return len(get_gpu_device_list())
 
 
-def get_device_memory():
-    "get gpu memory"
-    memory = 0
+def get_device_memory_str():
     with tempfile.TemporaryDirectory() as temp_dirname:
         suffix = ".exe" if is_windows() else ""
-        # TODO: Use NRSU because we can't assume nvidia-smi across all platforms.
         cmd = " ".join([
-            "nvidia-smi" + suffix, "--query-gpu=memory.total",
-            "--format=csv,noheader"
-        ])
-        # Try to get memory from nvidia-smi first, if failed, fallback to system memory from /proc/meminfo
-        # This fallback is needed for systems with unified memory (e.g. DGX Spark)
+            "nvidia-smi" + suffix, "--query-gpu=memory.total,memory.reserved,memory.used,memory.free",
+            "--format=csv,noheader"
+        ])
+        output = check_output(cmd, shell=True, cwd=temp_dirname)
+        return output.strip()
+
+
+def get_device_memory():
+    "get gpu memory"
+    memory = 0
+    # Try to get memory from nvidia-smi first, if failed, fallback to system memory from /proc/meminfo
+    # This fallback is needed for systems with unified memory (e.g. DGX Spark)
+    try:
+        output = get_device_memory_str()
+        memory_str = output.strip().split()[0]
+        # Check if nvidia-smi returned a valid numeric value
+        if "N/A" in memory_str:
+            raise ValueError("nvidia-smi returned invalid memory info")
+        memory = int(memory_str)
+    except (sp.CalledProcessError, ValueError, IndexError):
+        # Fallback to system memory from /proc/meminfo (in kB, convert to MiB)
         try:
-            output = check_output(cmd, shell=True, cwd=temp_dirname)
-            memory_str = output.strip().split()[0]
-            # Check if nvidia-smi returned a valid numeric value
-            if "N/A" in memory_str:
-                raise ValueError("nvidia-smi returned invalid memory info")
-            memory = int(memory_str)
-        except (sp.CalledProcessError, ValueError, IndexError):
-            # Fallback to system memory from /proc/meminfo (in kB, convert to MiB)
-            try:
-                with open("/proc/meminfo", "r") as f:
-                    for line in f:
-                        if line.startswith("MemTotal:"):
-                            memory = int(
-                                line.split()[1]) // 1024  # Convert kB to MiB
-                            break
-            except:
-                memory = 8192  # Default 8GB if all else fails
+            with open("/proc/meminfo", "r") as f:
+                for line in f:
+                    if line.startswith("MemTotal:"):
+                        memory = int(
+                            line.split()[1]) // 1024  # Convert kB to MiB
+                        break
+        except:
+            memory = 8192  # Default 8GB if all else fails
 
     return memory
 
 
+def print_device_memory():
+    memory_str = get_device_memory_str()
+    print(f"Device Memory:\ntotal: reserved: used: free: \n{memory_str}")
+    torch.cuda.empty_cache()
+    import gc
+    gc.collect()
+    memory_str = get_device_memory_str()
+    print(f"Device Memory:\ntotal: reserved: used: free: \n{memory_str}")
+
 def pytest_addoption(parser):
     parser.addoption(
         "--test-list",

tests/integration/test_lists/waives.txt

Lines changed: 0 additions & 1 deletion
@@ -408,7 +408,6 @@ triton_server/test_triton_llm.py::test_llmapi_backend[4-0-disableDecoupleMode-te
 accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=0-overlap_scheduler=False] SKIP (https://nvbugs/5701491)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5701425)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=2-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5701425)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp8] SKIP (https://nvbugs/5698897)
 unittest/_torch/modules/tests_lora_modules/test_lora_attention_pytorch_flow_vs_trt.py::TestLoraAttentionPytorchFlowVsTRT::test_lora_attention SKIP (https://nvbugs/5701421)
 unittest/llmapi/test_llm_pytorch.py::test_embedding_bias_with_torch_sampler_strategies SKIP (https://nvbugs/5702791)
 accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[no_cuda_graph_overlap-cutlass] SKIP (https://nvbugs/5702795)
