Skip to content

Commit 653d653

Browse files
GPU and Host memory usage logging
1 parent b6cb276 commit 653d653

File tree

9 files changed

+425
-12
lines changed

9 files changed

+425
-12
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ limitations under the License.
2424
- new: Global context with scoped variables - temporary context variables
2525
- new: Added new context variables `INPLACE_OPTIMIZE_WORKSPACE_CONTEXT_KEY` and `INPLACE_OPTIMIZE_MODULE_GRAPH_ID_CONTEXT_KEY`
2626
- new: nav.bundle.save now has include and exclude patterns for fine grained files selection
27+
- new: GPU and Host memory usage logging
2728
- change: Install the TensorRT package for architectures other than x86_64
2829
- change: Disable conversion fallback for TensorRT paths and expose control option in custom config
2930
- fix: Correctness command relative tolerance formula

README.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -264,6 +264,11 @@ setting the following environment variable:
264264
NAVIGATOR_USE_MULTIPROCESSING=False
265265
```
266266

267+
## GPU and Host memory logging
268+
By default, GPU and Host memory usage logs are saved in the main `navigator.log` file.
269+
270+
Setting the environment variable `NAVIGATOR_USE_SEPARATE_GPU_MEMORY_LOG_FILE=true` redirects memory usage logs to a separate `gpu_memory.log` file for better log separation.
271+
267272
## Examples
268273

269274
We offer comprehensive, step-by-step [guides](examples) that showcase the utilization of the Triton Model Navigator’s

model_navigator/commands/performance/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved.
1+
# Copyright (c) 2021-2025, NVIDIA CORPORATION. All rights reserved.
22
#
33
# Licensed under the Apache License, Version 2.0 (the "License");
44
# you may not use this file except in compliance with the License.

model_navigator/commands/performance/nvml_handler.py

Lines changed: 63 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
# limitations under the License.
1414
"""NVML handler."""
1515

16-
from typing import ContextManager, Optional
16+
from typing import ContextManager, Dict, Optional
1717

1818
import numpy as np
1919
from pynvml import (
@@ -23,6 +23,8 @@
2323
nvmlDeviceGetComputeRunningProcesses,
2424
nvmlDeviceGetCount,
2525
nvmlDeviceGetHandleByIndex,
26+
nvmlDeviceGetMemoryInfo,
27+
nvmlDeviceGetName,
2628
nvmlInit,
2729
nvmlShutdown,
2830
)
@@ -43,7 +45,7 @@ def __enter__(self) -> "NvmlHandler":
4345
nvmlInit()
4446
self._nvml_exists = True
4547
except NVMLError as e:
46-
LOGGER.debug(f"Unable to initialize NVML: {str(e)}")
48+
LOGGER.debug("Unable to initialize NVML: {}", str(e))
4749
self._nvml_exists = False
4850

4951
return self
@@ -54,7 +56,7 @@ def __exit__(self, exc_type, exc_val, exc_tb) -> None:
5456
try:
5557
nvmlShutdown()
5658
except NVMLError as e:
57-
LOGGER.debug(f"Unable to shutdown NVML: {str(e)}")
59+
LOGGER.debug("Unable to shutdown NVML: {}", str(e))
5860
finally:
5961
self._nvml_exists = False
6062

@@ -75,7 +77,7 @@ def gpu_clock(self) -> Optional[float]:
7577
gpus_running += 1
7678
gpu_clocks_sum += nvmlDeviceGetClockInfo(handle, NVML_CLOCK_GRAPHICS)
7779
except NVMLError as e:
78-
LOGGER.debug(f"Unable to collect NVML data for GPU {i}: {str(e)}")
80+
LOGGER.debug("Unable to collect NVML data for GPU {}: {}", i, str(e))
7981
continue
8082

8183
if gpus_running == 0:
@@ -93,5 +95,61 @@ def gpu_count(self) -> int:
9395
try:
9496
return nvmlDeviceGetCount()
9597
except NVMLError as e:
96-
LOGGER.debug(f"Unable to collect NVML device count: {str(e)}")
98+
LOGGER.debug("Unable to collect NVML device count: {}", str(e))
9799
return 0
100+
101+
def get_gpu_memory_info(self) -> Dict[int, Dict[str, float]]:
102+
"""Get memory information for all available GPUs.
103+
104+
Returns:
105+
Dictionary with GPU indices as keys and memory information as values
106+
"""
107+
memory_info = {}
108+
109+
if not self._nvml_exists:
110+
return memory_info
111+
112+
for i in range(self.gpu_count):
113+
try:
114+
handle = nvmlDeviceGetHandleByIndex(i)
115+
mem_info = nvmlDeviceGetMemoryInfo(handle)
116+
117+
try:
118+
gpu_name = nvmlDeviceGetName(handle)
119+
if isinstance(gpu_name, bytes):
120+
gpu_name = gpu_name.decode("utf-8")
121+
except NVMLError:
122+
gpu_name = f"GPU {i}"
123+
124+
# Convert bytes to megabytes for consistency with other logging
125+
memory_used_mb = mem_info.used / (1024 * 1024)
126+
memory_total_mb = mem_info.total / (1024 * 1024)
127+
memory_free_mb = mem_info.free / (1024 * 1024)
128+
129+
gpu_info = {
130+
"index": i,
131+
"name": gpu_name,
132+
"memory_used_mb": memory_used_mb,
133+
"memory_total_mb": memory_total_mb,
134+
"memory_free_mb": memory_free_mb,
135+
}
136+
memory_info[i] = gpu_info
137+
except NVMLError as e:
138+
LOGGER.debug("Unable to collect memory info for GPU {}: {}", i, str(e))
139+
continue
140+
141+
return memory_info
142+
143+
@property
144+
def gpu_memory(self) -> Dict[int, Dict[str, float]]:
145+
"""Returns complete memory information for all GPUs.
146+
147+
Returns:
148+
Dictionary with GPU indices as keys and complete GPU memory information as values:
149+
- index: GPU index
150+
- name: GPU name
151+
- memory_used_mb: Used memory in MB
152+
- memory_total_mb: Total memory in MB
153+
- memory_free_mb: Free memory in MB
154+
"""
155+
return self.get_gpu_memory_info()

model_navigator/core/logger.py

Lines changed: 55 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,10 @@ def navigator_record_predicate(record: Dict) -> bool:
105105

106106
def third_party_record_predicate(record: Dict) -> bool:
107107
"""Returns True if log emitted by 3rd party library."""
108-
return not navigator_record_predicate(record)
108+
# Import here to avoid circular imports
109+
from model_navigator.core.memory_logging import gpu_memory_record_predicate
110+
111+
return not navigator_record_predicate(record) and not gpu_memory_record_predicate(record)
109112

110113

111114
def forward_python_logging_to_loguru() -> None:
@@ -173,6 +176,12 @@ def reconfigure_logging_to_file(log_path: pathlib.Path) -> None:
173176
logger.remove() # remove existing configuration
174177
configure_logging_sink(log_path)
175178

179+
# configure GPU memory logging to a separate file
180+
from model_navigator.core.memory_logging import configure_gpu_memory_logging_sink
181+
182+
gpu_memory_log_path = log_path.parent / "gpu_memory.log"
183+
configure_gpu_memory_logging_sink(gpu_memory_log_path)
184+
176185
if OUTPUT_LOGS_FLAG in get_console_output():
177186
configure_logging_sink(sys.stderr)
178187

@@ -236,37 +245,78 @@ class LoggingContext(contextlib.AbstractContextManager):
236245
237246
Example of use:
238247
log_dir = pathlib.Path("/path/to/log/directory")
239-
with LoggingContext(log_dir=log_dir):
248+
with LoggingContext(log_dir=log_dir, command_name="ExampleCommand"):
240249
LOGGER.info("Log inside the context")
241250
"""
242251

243252
def __init__(
244253
self,
245254
*,
246255
log_dir: Optional[pathlib.Path] = None,
256+
command_name: Optional[str] = None,
257+
runner_cls=None,
258+
model_config=None,
247259
):
248260
"""Initialize the context.
249261
250262
Args:
251263
log_dir: Optional path to directory where log file is stored.
264+
command_name: Optional name of the command being executed.
265+
runner_cls: Optional runner class from execution unit.
266+
model_config: Optional model configuration from execution unit.
252267
"""
268+
self.sink_ids = None
269+
self.gpu_memory_sink_id = None
270+
self.command_name = command_name
271+
self.runner_cls = runner_cls
272+
self.model_config = model_config
273+
self.initial_memory_info = None
274+
self.initial_host_info = None
275+
253276
if log_dir:
277+
# Import here to avoid circular imports
278+
from model_navigator.core.memory_logging import configure_gpu_memory_logging_sink
279+
254280
log_dir.mkdir(parents=True, exist_ok=True)
255281
self.sink_ids = configure_logging_sink(log_dir / "format.log")
256-
else:
257-
self.sink_ids = None
282+
self.gpu_memory_sink_id = configure_gpu_memory_logging_sink(log_dir / "gpu_memory.log")
283+
284+
def __enter__(self):
285+
"""Enter the context and capture initial GPU and host memory usage without logging."""
286+
# Import here to avoid circular imports
287+
from model_navigator.core.memory_logging import get_memory_info
288+
289+
# Just capture memory info without logging
290+
self.initial_memory_info, self.initial_host_info = get_memory_info()
291+
return self
258292

259293
def __exit__(self, exc_type, exc_value, traceback): # noqa: F841
260-
"""Exit the context and clean handlers.
294+
"""Exit the context, log all memory usage in nested hierarchy, and clean handlers.
261295
262296
Args:
263297
exc_type: class of exception
264298
exc_value: type of exception
265299
traceback: traceback of exception
266300
"""
301+
# Import here to avoid circular imports
302+
from model_navigator.core.memory_logging import log_command_gpu_memory_usage
303+
304+
# Log GPU and host memory usage information
305+
log_command_gpu_memory_usage(
306+
initial_memory_info=self.initial_memory_info,
307+
initial_host_info=self.initial_host_info,
308+
command_name=self.command_name,
309+
runner_cls=self.runner_cls,
310+
model_config=self.model_config,
311+
)
312+
313+
# Remove logging sink handlers
267314
if self.sink_ids is not None:
268315
[logger.remove(sink_id) for sink_id in self.sink_ids]
269316

317+
if self.gpu_memory_sink_id is not None and self.gpu_memory_sink_id != 0:
318+
logger.remove(self.gpu_memory_sink_id)
319+
270320

271321
def log_dict(title: str, data: Dict):
272322
"""Log dictionary data with provided title.
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
# Copyright (c) 2021-2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
"""Memory logging package."""
15+
16+
from model_navigator.core.memory_logging.gpu_memory import (
17+
configure_gpu_memory_logging_sink,
18+
get_memory_info,
19+
gpu_memory_record_predicate,
20+
log_command_gpu_memory_usage,
21+
)
22+
23+
__all__ = [
24+
"configure_gpu_memory_logging_sink",
25+
"get_memory_info",
26+
"gpu_memory_record_predicate",
27+
"log_command_gpu_memory_usage",
28+
]

0 commit comments

Comments
 (0)