
Commit 64fd723

[CI][chore] Print device memory information for each case
Signed-off-by: Hui Gao <[email protected]>
1 parent 14554ab commit 64fd723

9 files changed: +140 additions, −65 deletions


tensorrt_llm/_torch/pyexecutor/py_executor.py

Lines changed: 6 additions & 0 deletions

@@ -452,6 +452,12 @@ def shutdown(self):
         """
         Signals the server to shutdown.
         """
+        import traceback
+        traceback.print_stack()
+        import os
+        print(
+            f"====================== shutdown in executor is called pid: {os.getpid()}"
+        )
         self.executor_request_queue.enqueue_shutdown_request()
         self.shutdown_event.wait()
         self.worker_thread.join()

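For comparison, a minimal sketch (not part of this commit) of the same shutdown tracing gated behind an environment variable so the prints stay quiet unless explicitly enabled; TLLM_DEBUG_SHUTDOWN is an assumed name, not an existing flag:

    # Hypothetical sketch: only emit the shutdown trace when the (assumed)
    # TLLM_DEBUG_SHUTDOWN variable is set, instead of printing unconditionally.
    import os
    import traceback

    if os.environ.get("TLLM_DEBUG_SHUTDOWN") == "1":
        traceback.print_stack()
        print(f"shutdown in executor is called pid: {os.getpid()}")
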
tensorrt_llm/_torch/pyexecutor/py_executor_creator.py

Lines changed: 4 additions & 1 deletion

@@ -221,7 +221,10 @@ def create_py_executor(
     tokenizer: Optional[TokenizerBase] = None,
     profiling_stage_data: Optional[dict] = None,
 ) -> PyExecutor:
-
+    # import os
+    # print(f"====================== create_py_executor pid: {os.getpid()}")
+    # import traceback
+    # print(f"====================== backtrace: {traceback.print_stack()}")
     garbage_collection_gen0_threshold = llm_args.garbage_collection_gen0_threshold
     lora_config = llm_args.lora_config
     kv_connector_config = llm_args.kv_connector_config

tensorrt_llm/commands/serve.py

Lines changed: 3 additions & 0 deletions

@@ -44,6 +44,9 @@
 
 def _signal_handler_cleanup_child(signum, frame):
     """Signal handler to clean up the child process."""
+    print(
+        f"================================================ server received signal {signal.Signals(signum).name}"
+    )
     global _child_p_global
     if _child_p_global and _child_p_global.poll() is None:
         # Using print for safety in signal handlers

tensorrt_llm/executor/rpc_proxy.py

Lines changed: 9 additions & 0 deletions

@@ -187,8 +187,17 @@ def abort_request(self, request_id: int) -> None:
         return self.rpc_client.abort_request(request_id).remote()
 
     def shutdown(self):
+        import traceback
+        traceback.print_stack()
+        import os
+        print(
+            f"====================== shutdown in generator is called pid: {os.getpid()}"
+        )
         if self._shutdown_event.is_set():
             return
+        print(
+            f"====================== shutdown in generator 2 is called pid: {os.getpid()}"
+        )
         self._shutdown_event.set()
         logger_debug(f"Shutting down GenerationExecutorRpcProxy",
                      color="yellow")

tensorrt_llm/llmapi/llm.py

Lines changed: 16 additions & 0 deletions

@@ -172,6 +172,10 @@ def __init__(self,
                     f"{self.__class__.__name__} got invalid argument: {key}"
                 )
 
+        import os
+        print(
+            f"====================== llm class is: {llm_args_cls} pid: {os.getpid()}"
+        )
         self.args = llm_args_cls.from_kwargs(
             model=model,
             tokenizer=tokenizer,
@@ -814,6 +818,10 @@ def _try_load_hf_model_config(
 
     @set_api_status("beta")
     def shutdown(self) -> None:
+        import traceback
+        traceback.print_stack()
+        import os
+        print(f"====================== shutdown is called pid: {os.getpid()}")
         if hasattr(self, "_executor") and self._executor is not None:
             self._executor.shutdown()
             self._executor = None
@@ -837,6 +845,10 @@ def _check_health(self) -> bool:
     def _shutdown_wrapper(self_ref):
         # Retrieve the instance if it still exists
         instance = self_ref()
+        import traceback
+        traceback.print_stack()
+        import os
+        print(f"====================== shutdown is called pid: {os.getpid()}")
         if instance is not None:
             instance.shutdown()
 
@@ -848,6 +860,10 @@ def __exit__(
     ) -> Literal[
             False]:  # https://github.com/microsoft/pyright/issues/7009#issuecomment-1894135045
         del exc_value, traceback
+        import traceback
+        traceback.print_stack()
+        import os
+        print(f"====================== shutdown is called pid: {os.getpid()}")
         self.shutdown()
         return False  # propagate exceptions
 

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 49 additions & 7 deletions

@@ -14,6 +14,7 @@
 # limitations under the License.
 import os
 import sys
+import time
 
 import pytest
 import torch
@@ -59,9 +60,9 @@ def patched_start_mpi_pool(self):
 from tensorrt_llm.quantization import QuantAlgo
 
 from ..conftest import (get_device_count, get_device_memory, llm_models_root,
-                        parametrize_with_ids, skip_no_hopper,
-                        skip_post_blackwell, skip_pre_ada, skip_pre_blackwell,
-                        skip_pre_hopper, skip_ray)
+                        parametrize_with_ids, print_device_memory,
+                        skip_no_hopper, skip_post_blackwell, skip_pre_ada,
+                        skip_pre_blackwell, skip_pre_hopper, skip_ray)
 from .accuracy_core import (GSM8K, MMLU, CnnDailymail, GPQADiamond,
                             JsonModeEval, LlmapiAccuracyTestHarness,
                             LongBenchV2)
@@ -533,7 +534,9 @@ class TestLlama3_2_1B(LlmapiAccuracyTestHarness):
     MODEL_PATH = f"{llm_models_root()}/llama-3.2-models/Llama-3.2-1B"
     EXAMPLE_FOLDER = "models/core/llama"
 
-    def test_auto_dtype(self):
+    @pytest.mark.parametrize("pp_size", [2, 4], ids=["pp2", "pp4"])
+    def test_auto_dtype(self, pp_size):
+        print_device_memory()
         with LLM(self.MODEL_PATH) as llm:
             task = CnnDailymail(self.MODEL_NAME)
             task.evaluate(llm)
@@ -1328,6 +1331,7 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
     @parametrize_with_ids("mtp_nextn", [0, 2])
     def test_bfloat16(self, mtp_nextn, attention_dp, cuda_graph,
                       overlap_scheduler, torch_compile, enable_chunked_prefill):
+        print_device_memory()
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
         torch_compile_config = TorchCompileConfig(
             enable_fullgraph=True,
@@ -1351,6 +1355,11 @@ def test_bfloat16(self, mtp_nextn, attention_dp, cuda_graph,
                  speculative_config=mtp_config) as llm:
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
+            print_device_memory()
+
+        time.sleep(60)
+        print(f"================= print mem after 60s")
+        print_device_memory()
 
     @pytest.mark.skip_less_device_memory(60000)
     def test_bfloat16_2_model_mtp(self):
@@ -1406,6 +1415,10 @@ def test_bfloat16_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
         mtp_config = None
         if mtp_nextn > 0:
             mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
+
+        #time.sleep(5)
+        print(f"================= print mem before testing")
+        print_device_memory()
         with LLM(self.MODEL_PATH,
                  tensor_parallel_size=tp_size,
                  pipeline_parallel_size=pp_size,
@@ -1417,6 +1430,18 @@ def test_bfloat16_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
                  speculative_config=mtp_config) as llm:
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
+            print(f"================= print mem after testing")
+            print_device_memory()
+
+        #time.sleep(5)
+        print(f"================= print mem after testing outside")
+        print_device_memory()
+
+        print(f"++++++++++++++++++++++++++++++++++++++++\n\n\n")
+
+        #time.sleep(60)
+        #print(f"================= print mem after 60s")
+        #print_device_memory()
 
     @skip_pre_hopper
     @parametrize_with_ids("torch_compile", [False, True])
@@ -2263,6 +2288,13 @@ def test_nvfp4_multi_gpus(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv,
         if moe_backend == "TRTLLM" and sm_version in (120, 121):
             pytest.skip(f"{moe_backend} backend does not support SM 120 or 121")
 
+        import gc
+        gc.collect()
+        torch.cuda.empty_cache()
+
+        print(f"\n--- nvidia-smi start to test ---")
+        print_device_memory()
+
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.70)
         pytorch_config = dict(
             disable_overlap_scheduler=not overlap_scheduler,
@@ -2297,9 +2329,19 @@ def test_nvfp4_multi_gpus(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv,
             task = CnnDailymail(self.MODEL_NAME)
             task.evaluate(llm)
             # Commented out because GPQA takes too long to run
-            # task = GPQADiamond(self.MODEL_NAME)
-            # task.evaluate(llm,
-            #               extra_evaluator_kwargs=dict(apply_chat_template=True))
+            task = GPQADiamond(self.MODEL_NAME)
+            task.evaluate(llm,
+                          extra_evaluator_kwargs=dict(apply_chat_template=True))
+            print("=================================== test finishes")
+            print_device_memory()
+
+            import gc
+            gc.collect()
+            torch.cuda.empty_cache()
+
+            time.sleep(180)
+            print(f"\n--- nvidia-smi after testing after 180s ---")
+            print_device_memory()
 
     @skip_pre_blackwell
     @pytest.mark.parametrize(

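For context, a minimal sketch (hypothetical, not part of this commit) of measuring the device-memory delta around one evaluation with torch.cuda.mem_get_info, the same call the print_device_memory() helper below relies on:

    import torch

    # mem_get_info() returns (free_bytes, total_bytes) for the current device.
    free_before, total = torch.cuda.mem_get_info()
    # ... run the test body here, e.g. task.evaluate(llm) ...
    free_after, _ = torch.cuda.mem_get_info()
    delta_mib = (free_before - free_after) / 2**20
    print(f"device memory delta: {delta_mib:.1f} MiB of {total / 2**20:.1f} MiB total")
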
tests/integration/defs/conftest.py

Lines changed: 49 additions & 24 deletions

@@ -2004,40 +2004,65 @@ def get_device_count():
     return len(get_gpu_device_list())
 
 
-def get_device_memory():
-    "get gpu memory"
-    memory = 0
+def get_device_memory_str():
     with tempfile.TemporaryDirectory() as temp_dirname:
         suffix = ".exe" if is_windows() else ""
-        # TODO: Use NRSU because we can't assume nvidia-smi across all platforms.
         cmd = " ".join([
-            "nvidia-smi" + suffix, "--query-gpu=memory.total",
+            "nvidia-smi" + suffix,
+            "--query-gpu=memory.total,memory.reserved,memory.used,memory.free",
            "--format=csv,noheader"
         ])
-        # Try to get memory from nvidia-smi first, if failed, fallback to system memory from /proc/meminfo
-        # This fallback is needed for systems with unified memory (e.g. DGX Spark)
+        output = check_output(cmd, shell=True, cwd=temp_dirname)
+        return output.strip()
+
+
+def get_device_memory():
+    "get gpu memory"
+    memory = 0
+    # Try to get memory from nvidia-smi first; if that fails, fall back to system memory from /proc/meminfo.
+    # This fallback is needed for systems with unified memory (e.g. DGX Spark).
+    try:
+        output = get_device_memory_str()
+        memory_str = output.strip().split()[0]
+        # Check if nvidia-smi returned a valid numeric value
+        if "N/A" in memory_str:
+            raise ValueError("nvidia-smi returned invalid memory info")
+        memory = int(memory_str)
+    except (sp.CalledProcessError, ValueError, IndexError):
+        # Fallback to system memory from /proc/meminfo (in kB, convert to MiB)
         try:
-            output = check_output(cmd, shell=True, cwd=temp_dirname)
-            memory_str = output.strip().split()[0]
-            # Check if nvidia-smi returned a valid numeric value
-            if "N/A" in memory_str:
-                raise ValueError("nvidia-smi returned invalid memory info")
-            memory = int(memory_str)
-        except (sp.CalledProcessError, ValueError, IndexError):
-            # Fallback to system memory from /proc/meminfo (in kB, convert to MiB)
-            try:
-                with open("/proc/meminfo", "r") as f:
-                    for line in f:
-                        if line.startswith("MemTotal:"):
-                            memory = int(
-                                line.split()[1]) // 1024  # Convert kB to MiB
-                            break
-            except:
-                memory = 8192  # Default 8GB if all else fails
+            with open("/proc/meminfo", "r") as f:
+                for line in f:
+                    if line.startswith("MemTotal:"):
+                        memory = int(
+                            line.split()[1]) // 1024  # Convert kB to MiB
+                        break
+        except:
+            memory = 8192  # Default 8GB if all else fails
 
     return memory
 
 
+def print_device_memory():
+    memory_str = get_device_memory_str()
+    print(f"Device Memory:\ntotal: reserved: used: free: \n{memory_str}")
+
+    mem_stats = torch.cuda.memory_stats()
+    torch_allocated_bytes = mem_stats["allocated_bytes.all.current"]
+    torch_reserved_bytes = mem_stats["reserved_bytes.all.current"]
+    print(
+        f"================================== torch mem stats: allocated {torch_allocated_bytes} reserved {torch_reserved_bytes}"
+    )
+    print(f"\n--- nvidia-smi in print_device_memory ---")
+    sp.run(["nvidia-smi"], check=False)
+
+    free_bytes, total_gpu_memory = torch.cuda.mem_get_info()
+    total_used_bytes = total_gpu_memory - free_bytes
+    print(
+        f"================================== torch mem info: free {free_bytes}, total {total_gpu_memory}, used {total_used_bytes}"
+    )
+
+
 def pytest_addoption(parser):
     parser.addoption(
         "--test-list",

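As a usage illustration only, a minimal sketch of printing device memory around every case via an autouse pytest fixture; it assumes the print_device_memory() helper added above, and the fixture itself is hypothetical rather than part of this commit (which calls the helper explicitly inside each test):

    import pytest

    @pytest.fixture(autouse=True)
    def _report_device_memory():
        # Hypothetical fixture: snapshot device memory before and after each
        # test case instead of calling print_device_memory() in every test body.
        print_device_memory()
        yield
        print_device_memory()
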
tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml

Lines changed: 4 additions & 4 deletions

@@ -32,10 +32,10 @@ l0_gb200_multi_nodes:
       backend: pytorch
   tests:
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency] TIMEOUT (180)
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp8] TIMEOUT (180)
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen] TIMEOUT (180)
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_adp_lmtp] TIMEOUT (180)
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen_adp_lmtp] TIMEOUT (180)
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp8] TIMEOUT (180) ISOLATION
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen] TIMEOUT (180) ISOLATION
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_adp_lmtp] TIMEOUT (180) ISOLATION
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen_adp_lmtp] TIMEOUT (180) ISOLATION
   - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass] TIMEOUT (90)
   - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] TIMEOUT (90)
   - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_attention_dp] TIMEOUT (90)
