Skip to content

Commit 653d653

Browse files
GPU and Host memory usage logging
1 parent b6cb276 commit 653d653

File tree

9 files changed

+425
-12
lines changed

9 files changed

+425
-12
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ limitations under the License.
2424
- new: Global context with scoped variables - temporary context variables
2525
- new: Added new context variables `INPLACE_OPTIMIZE_WORKSPACE_CONTEXT_KEY` and `INPLACE_OPTIMIZE_MODULE_GRAPH_ID_CONTEXT_KEY`
2626
- new: nav.bundle.save now has include and exclude patterns for fine grained files selection
27+
- new: GPU and Host memory usage logging
2728
- change: Install the TensorRT package for architectures other than x86_64
2829
- change: Disable conversion fallback for TensorRT paths and expose control option in custom config
2930
- fix: Correctness command relative tolerance formula

README.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -264,6 +264,11 @@ setting the following environment variable:
264264
NAVIGATOR_USE_MULTIPROCESSING=False
265265
```
266266

267+
## GPU and Host memory logging
268+
By default, GPU and Host memory usage logs are saved in the main `navigator.log` file.
269+
270+
Setting the environment variable `NAVIGATOR_USE_SEPARATE_GPU_MEMORY_LOG_FILE=true` redirects memory usage logs to a separate `gpu_memory.log` file for better log separation.
271+
267272
## Examples
268273

269274
We offer comprehensive, step-by-step [guides](examples) that showcase the utilization of the Triton Model Navigator’s

model_navigator/commands/performance/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved.
1+
# Copyright (c) 2021-2025, NVIDIA CORPORATION. All rights reserved.
22
#
33
# Licensed under the Apache License, Version 2.0 (the "License");
44
# you may not use this file except in compliance with the License.

model_navigator/commands/performance/nvml_handler.py

Lines changed: 63 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
# limitations under the License.
1414
"""NVML handler."""
1515

16-
from typing import ContextManager, Optional
16+
from typing import ContextManager, Dict, Optional
1717

1818
import numpy as np
1919
from pynvml import (
@@ -23,6 +23,8 @@
2323
nvmlDeviceGetComputeRunningProcesses,
2424
nvmlDeviceGetCount,
2525
nvmlDeviceGetHandleByIndex,
26+
nvmlDeviceGetMemoryInfo,
27+
nvmlDeviceGetName,
2628
nvmlInit,
2729
nvmlShutdown,
2830
)
@@ -43,7 +45,7 @@ def __enter__(self) -> "NvmlHandler":
4345
nvmlInit()
4446
self._nvml_exists = True
4547
except NVMLError as e:
46-
LOGGER.debug(f"Unable to initialize NVML: {str(e)}")
48+
LOGGER.debug("Unable to initialize NVML: {}", str(e))
4749
self._nvml_exists = False
4850

4951
return self
@@ -54,7 +56,7 @@ def __exit__(self, exc_type, exc_val, exc_tb) -> None:
5456
try:
5557
nvmlShutdown()
5658
except NVMLError as e:
57-
LOGGER.debug(f"Unable to shutdown NVML: {str(e)}")
59+
LOGGER.debug("Unable to shutdown NVML: {}", str(e))
5860
finally:
5961
self._nvml_exists = False
6062

@@ -75,7 +77,7 @@ def gpu_clock(self) -> Optional[float]:
7577
gpus_running += 1
7678
gpu_clocks_sum += nvmlDeviceGetClockInfo(handle, NVML_CLOCK_GRAPHICS)
7779
except NVMLError as e:
78-
LOGGER.debug(f"Unable to collect NVML data for GPU {i}: {str(e)}")
80+
LOGGER.debug("Unable to collect NVML data for GPU {}: {}", i, str(e))
7981
continue
8082

8183
if gpus_running == 0:
@@ -93,5 +95,61 @@ def gpu_count(self) -> int:
9395
try:
9496
return nvmlDeviceGetCount()
9597
except NVMLError as e:
96-
LOGGER.debug(f"Unable to collect NVML device count: {str(e)}")
98+
LOGGER.debug("Unable to collect NVML device count: {}", str(e))
9799
return 0
100+
101+
def get_gpu_memory_info(self) -> Dict[int, Dict[str, float]]:
102+
"""Get memory information for all available GPUs.
103+
104+
Returns:
105+
Dictionary with GPU indices as keys and memory information as values
106+
"""
107+
memory_info = {}
108+
109+
if not self._nvml_exists:
110+
return memory_info
111+
112+
for i in range(self.gpu_count):
113+
try:
114+
handle = nvmlDeviceGetHandleByIndex(i)
115+
mem_info = nvmlDeviceGetMemoryInfo(handle)
116+
117+
try:
118+
gpu_name = nvmlDeviceGetName(handle)
119+
if isinstance(gpu_name, bytes):
120+
gpu_name = gpu_name.decode("utf-8")
121+
except NVMLError:
122+
gpu_name = f"GPU {i}"
123+
124+
# Convert bytes to megabytes for consistency with other logging
125+
memory_used_mb = mem_info.used / (1024 * 1024)
126+
memory_total_mb = mem_info.total / (1024 * 1024)
127+
memory_free_mb = mem_info.free / (1024 * 1024)
128+
129+
gpu_info = {
130+
"index": i,
131+
"name": gpu_name,
132+
"memory_used_mb": memory_used_mb,
133+
"memory_total_mb": memory_total_mb,
134+
"memory_free_mb": memory_free_mb,
135+
}
136+
memory_info[i] = gpu_info
137+
except NVMLError as e:
138+
LOGGER.debug("Unable to collect memory info for GPU {}: {}", i, str(e))
139+
continue
140+
141+
return memory_info
142+
143+
@property
144+
def gpu_memory(self) -> Dict[int, Dict[str, float]]:
145+
"""Returns complete memory information for all GPUs.
146+
147+
Returns:
148+
Dictionary with GPU indices as keys and complete GPU memory information as values:
149+
- index: GPU index
150+
- name: GPU name
151+
- memory_used_mb: Used memory in MB
152+
- memory_total_mb: Total memory in MB
153+
- memory_free_mb: Free memory in MB
154+
"""
155+
return self.get_gpu_memory_info()

model_navigator/core/logger.py

Lines changed: 55 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,10 @@ def navigator_record_predicate(record: Dict) -> bool:
105105

106106
def third_party_record_predicate(record: Dict) -> bool:
107107
"""Returns True if log emitted by 3rd party library."""
108-
return not navigator_record_predicate(record)
108+
# Import here to avoid circular imports
109+
from model_navigator.core.memory_logging import gpu_memory_record_predicate
110+
111+
return not navigator_record_predicate(record) and not gpu_memory_record_predicate(record)
109112

110113

111114
def forward_python_logging_to_loguru() -> None:
@@ -173,6 +176,12 @@ def reconfigure_logging_to_file(log_path: pathlib.Path) -> None:
173176
logger.remove() # remove existing configuration
174177
configure_logging_sink(log_path)
175178

179+
# configure GPU memory logging to a separate file
180+
from model_navigator.core.memory_logging import configure_gpu_memory_logging_sink
181+
182+
gpu_memory_log_path = log_path.parent / "gpu_memory.log"
183+
configure_gpu_memory_logging_sink(gpu_memory_log_path)
184+
176185
if OUTPUT_LOGS_FLAG in get_console_output():
177186
configure_logging_sink(sys.stderr)
178187

@@ -236,37 +245,78 @@ class LoggingContext(contextlib.AbstractContextManager):
236245
237246
Example of use:
238247
log_dir = pathlib.Path("/path/to/log/directory")
239-
with LoggingContext(log_dir=log_dir):
248+
with LoggingContext(log_dir=log_dir, command_name="ExampleCommand"):
240249
LOGGER.info("Log inside the context")
241250
"""
242251

243252
def __init__(
244253
self,
245254
*,
246255
log_dir: Optional[pathlib.Path] = None,
256+
command_name: Optional[str] = None,
257+
runner_cls=None,
258+
model_config=None,
247259
):
248260
"""Initialize the context.
249261
250262
Args:
251263
log_dir: Optional path to directory where log file is stored.
264+
command_name: Optional name of the command being executed.
265+
runner_cls: Optional runner class from execution unit.
266+
model_config: Optional model configuration from execution unit.
252267
"""
268+
self.sink_ids = None
269+
self.gpu_memory_sink_id = None
270+
self.command_name = command_name
271+
self.runner_cls = runner_cls
272+
self.model_config = model_config
273+
self.initial_memory_info = None
274+
self.initial_host_info = None
275+
253276
if log_dir:
277+
# Import here to avoid circular imports
278+
from model_navigator.core.memory_logging import configure_gpu_memory_logging_sink
279+
254280
log_dir.mkdir(parents=True, exist_ok=True)
255281
self.sink_ids = configure_logging_sink(log_dir / "format.log")
256-
else:
257-
self.sink_ids = None
282+
self.gpu_memory_sink_id = configure_gpu_memory_logging_sink(log_dir / "gpu_memory.log")
283+
284+
def __enter__(self):
285+
"""Enter the context and capture initial GPU and host memory usage without logging."""
286+
# Import here to avoid circular imports
287+
from model_navigator.core.memory_logging import get_memory_info
288+
289+
# Just capture memory info without logging
290+
self.initial_memory_info, self.initial_host_info = get_memory_info()
291+
return self
258292

259293
def __exit__(self, exc_type, exc_value, traceback): # noqa: F841
260-
"""Exit the context and clean handlers.
294+
"""Exit the context, log all memory usage in nested hierarchy, and clean handlers.
261295
262296
Args:
263297
exc_type: class of exception
264298
exc_value: type of exception
265299
traceback: traceback of exception
266300
"""
301+
# Import here to avoid circular imports
302+
from model_navigator.core.memory_logging import log_command_gpu_memory_usage
303+
304+
# Log GPU and host memory usage information
305+
log_command_gpu_memory_usage(
306+
initial_memory_info=self.initial_memory_info,
307+
initial_host_info=self.initial_host_info,
308+
command_name=self.command_name,
309+
runner_cls=self.runner_cls,
310+
model_config=self.model_config,
311+
)
312+
313+
# Remove logging sink handlers
267314
if self.sink_ids is not None:
268315
[logger.remove(sink_id) for sink_id in self.sink_ids]
269316

317+
if self.gpu_memory_sink_id is not None and self.gpu_memory_sink_id != 0:
318+
logger.remove(self.gpu_memory_sink_id)
319+
270320

271321
def log_dict(title: str, data: Dict):
272322
"""Log dictionary data with provided title.
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
# Copyright (c) 2021-2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
"""Memory logging package."""
15+
16+
from model_navigator.core.memory_logging.gpu_memory import (
17+
configure_gpu_memory_logging_sink,
18+
get_memory_info,
19+
gpu_memory_record_predicate,
20+
log_command_gpu_memory_usage,
21+
)
22+
23+
__all__ = [
24+
"configure_gpu_memory_logging_sink",
25+
"get_memory_info",
26+
"gpu_memory_record_predicate",
27+
"log_command_gpu_memory_usage",
28+
]

0 commit comments

Comments
 (0)