
Commit 1e2e669

Cleared 2x+ dangling memory after compilation

1 parent 6b1950c commit 1e2e669

File tree

5 files changed: +50 −6 lines changed

py/torch_tensorrt/dynamo/_compiler.py

Lines changed: 7 additions & 1 deletion

@@ -42,6 +42,7 @@
 )
 from torch_tensorrt.dynamo.utils import (
     deallocate_module,
+    get_cpu_memory_usage,
     get_flat_args_with_check,
     get_output_metadata,
     parse_graph_io,
@@ -675,7 +676,7 @@ def compile(
        "l2_limit_for_tiling": l2_limit_for_tiling,
        "offload_module_to_cpu": offload_module_to_cpu,
    }
-
+    logger.debug(f"CPU memory usage before lowering: {get_cpu_memory_usage()} MB")
    settings = CompilationSettings(**compilation_options)
    logger.info("Compilation Settings: %s\n", settings)
    exported_program = pre_export_lowering(exported_program, settings)
@@ -689,6 +690,7 @@

    # Apply lowering on the graph module
    gm = post_lowering(gm, settings)
+    logger.debug(f"CPU memory usage after post_lowering: {get_cpu_memory_usage()} MB")
    logger.debug("Lowered Input graph: " + str(gm.graph))

    # Move the weights in the state_dict to CPU
@@ -698,6 +700,7 @@
        logger.info(
            "The PyTorch model was moved to the CPU to allocate all GPU memory to TensorRT. To retain the model on the GPU, set offload_module_to_cpu=False"
        )
+        logger.debug(f"CPU memory usage after CPU offload: {get_cpu_memory_usage()} MB")
    else:
        remaining_memory, total_memory = torch.cuda.mem_get_info()
        if remaining_memory < total_memory // 2:
@@ -859,6 +862,9 @@ def preserve_module_specs(
    # Iterate over all components that can be accelerated
    # Generate the corresponding TRT Module for those

+    # Here we delete the frozen parameters from the graph module. Note this does not affect the submodules. We are going to delete the frozen parameters from the submodules in the convert_module function.
+    # This is done to release CPU memory.
+    [delattr(gm, attr) for attr in dir(gm) if attr.startswith("_frozen_param")]
    for name, _ in partitioned_module.named_children():
        submodule = getattr(partitioned_module, name)
        # filter on the GraphModule
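
Background on the deletion above: constant folding during Dynamo lowering leaves folded constant tensors behind as `_frozen_param*` attributes, and after partitioning the submodules carry their own copies, so the top-level references only pin duplicate CPU memory. A minimal standalone sketch of the pattern (TinyModule and its sizes are illustrative, not Torch-TensorRT code):

import torch

class TinyModule(torch.nn.Module):
    def __init__(self) -> None:
        super().__init__()
        # Stand-ins for constants left behind by constant folding.
        self._frozen_param0 = torch.randn(1024, 1024)  # ~4 MB
        self._frozen_param1 = torch.randn(1024, 1024)  # ~4 MB

gm = TinyModule()
for attr in [a for a in dir(gm) if a.startswith("_frozen_param")]:
    delattr(gm, attr)  # drop the reference so each ~4 MB tensor can be reclaimed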

py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py

Lines changed: 18 additions & 3 deletions

@@ -50,7 +50,12 @@
 from torch_tensorrt.dynamo.debug._DebuggerConfig import DebuggerConfig
 from torch_tensorrt.dynamo.debug._supports_debugger import cls_supports_debugger
 from torch_tensorrt.dynamo.observer import Observer
-from torch_tensorrt.dynamo.utils import DYNAMIC_DIM, deallocate_module, to_torch_device
+from torch_tensorrt.dynamo.utils import (
+    DYNAMIC_DIM,
+    deallocate_module,
+    get_cpu_memory_usage,
+    to_torch_device,
+)
 from torch_tensorrt.logging import TRT_LOGGER

 _LOGGER: logging.Logger = logging.getLogger(__name__)
@@ -729,7 +734,13 @@ def run(
        if interpreter_result is not None:  # hit the cache
            return interpreter_result  # type: ignore[no-any-return]

+        _LOGGER.debug(
+            f"CPU memory usage before network construction: {get_cpu_memory_usage()} MB"
+        )
        self._construct_trt_network_def()
+        _LOGGER.debug(
+            f"CPU memory usage after network construction: {get_cpu_memory_usage()} MB"
+        )

        if not self.compilation_settings.immutable_weights:
            self._save_weight_mapping()
@@ -747,12 +758,16 @@
        self._create_timing_cache(
            builder_config, self.compilation_settings.timing_cache_path
        )
-
+        _LOGGER.debug(
+            f"CPU memory usage before engine building: {get_cpu_memory_usage()} MB"
+        )
        cuda_engine = self.builder.build_engine_with_config(
            self.ctx.net, builder_config
        )
        assert cuda_engine
-
+        _LOGGER.debug(
+            f"CPU memory usage after engine building: {get_cpu_memory_usage()} MB"
+        )
        _LOGGER.info(
            f"Build TRT engine elapsed time: {datetime.now() - build_engine_start_time}"
        )
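
The before/after pairs above bracket each compilation phase with an RSS reading. The same instrumentation could be expressed once as a context manager; a sketch under the same psutil assumption (log_rss is hypothetical, not part of Torch-TensorRT):

import logging
from contextlib import contextmanager
from typing import Iterator

import psutil

_LOGGER = logging.getLogger(__name__)

@contextmanager
def log_rss(phase: str) -> Iterator[None]:
    proc = psutil.Process()
    before = proc.memory_info().rss / 1024 / 1024
    _LOGGER.debug(f"CPU memory usage before {phase}: {before} MB")
    try:
        yield
    finally:
        after = proc.memory_info().rss / 1024 / 1024
        _LOGGER.debug(
            f"CPU memory usage after {phase}: {after} MB (delta {after - before:+.1f} MB)"
        )

# Usage mirroring the diff:
# with log_rss("network construction"):
#     self._construct_trt_network_def()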

py/torch_tensorrt/dynamo/conversion/_conversion.py

Lines changed: 13 additions & 2 deletions

@@ -14,7 +14,11 @@
     TRTInterpreterResult,
 )
 from torch_tensorrt.dynamo.runtime import PythonTorchTensorRTModule, TorchTensorRTModule
-from torch_tensorrt.dynamo.utils import get_output_dtypes
+from torch_tensorrt.dynamo.utils import (
+    get_cpu_memory_usage,
+    get_output_dtypes,
+    trim_memory,
+)

 logger = logging.getLogger(__name__)

@@ -29,7 +33,7 @@ def infer_module_output_dtypes(
    """
    outputs = [node for node in module.graph.nodes if node.op == "output"]
    outputs = outputs[0].args
-    return get_output_dtypes(outputs, truncate_double)  # type: ignore[no-any-return]
+    return get_output_dtypes(outputs, truncate_double)


def interpret_module_to_result(
@@ -103,6 +107,13 @@ def convert_module(
            "Since Torch-TensorRT runtime is not available, using Python Runtime, some features may not be available"
        )

+    # Delete the frozen parameters from the module to release CPU memory
+    [delattr(module, attr) for attr in dir(module) if attr.startswith("_frozen_param")]
+    trim_memory()
+    logger.debug(
+        f"CPU memory usage after clearing frozen parameters and trimming memory: {get_cpu_memory_usage()} MB"
+    )
+
    return rt_cls(
        cuda_engine=interpreter_result.engine,
        input_binding_names=list(interpreter_result.input_names),
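
Why trim_memory() follows the delattr here: glibc's malloc can keep freed small-block pages in its arenas rather than returning them to the OS, so RSS stays inflated even after Python has released the tensors; malloc_trim(0) hands the free pages back. A Linux/glibc-only sketch of the effect (the exact numbers depend on allocator version and tunables):

import ctypes
import gc

import psutil

def rss_mb() -> float:
    return psutil.Process().memory_info().rss / 1024 / 1024

# ~200 MB of 1 KB chunks: small enough to be served from malloc arenas,
# not from mmap, so freeing them does not automatically shrink RSS.
blobs = [bytes(1024) for _ in range(200_000)]
print(f"allocated:        {rss_mb():.0f} MB")

del blobs
gc.collect()
print(f"after free:       {rss_mb():.0f} MB")  # often still high

ctypes.CDLL("libc.so.6").malloc_trim(0)  # release free arena pages to the OS
print(f"after malloc_trim: {rss_mb():.0f} MB")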

py/torch_tensorrt/dynamo/debug/_Debugger.py

Lines changed: 1 addition & 0 deletions

@@ -197,6 +197,7 @@ def get_logging_config(self, log_level: Optional[int] = None) -> dict[str, Any]:
                "class": "logging.FileHandler",
                "filename": f"{self.cfg.logging_dir}/torch_tensorrt_logging.log",
                "formatter": "standard",
+                "mode": "w",  # This will clear the previous content
            }
            config["loggers"][""]["handlers"].append("file")
        return config
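
How the added "mode": "w" takes effect: dictConfig passes any handler keys beyond class/level/formatter/filters straight to the handler constructor, and logging.FileHandler(..., mode="w") truncates the file instead of appending (the default mode is "a"). A standalone sketch with an illustrative path:

import logging
import logging.config

config = {
    "version": 1,
    "formatters": {"standard": {"format": "%(levelname)s %(message)s"}},
    "handlers": {
        "file": {
            "class": "logging.FileHandler",
            "filename": "/tmp/torch_tensorrt_logging.log",
            "formatter": "standard",
            "mode": "w",  # truncate on startup; "a" would append
        }
    },
    "root": {"handlers": ["file"], "level": "DEBUG"},
}

logging.config.dictConfig(config)
logging.debug("fresh log file on every run")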

py/torch_tensorrt/dynamo/utils.py

Lines changed: 11 additions & 0 deletions

@@ -1,5 +1,6 @@
 from __future__ import annotations

+import ctypes
 import gc
 import logging
 import warnings
@@ -8,6 +9,7 @@
 from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union

 import numpy as np
+import psutil
 import sympy
 import tensorrt as trt
 import torch
@@ -858,3 +860,12 @@ def is_thor() -> bool:
    if torch.cuda.get_device_capability() in [(11, 0)]:
        return True
    return False
+
+
+def get_cpu_memory_usage() -> Any:
+    return psutil.Process().memory_info().rss / 1024 / 1024
+
+
+def trim_memory() -> Any:
+    libc = ctypes.CDLL("libc.so.6")
+    return libc.malloc_trim(0)
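
Note that the two helpers as committed are Linux/glibc-specific: ctypes.CDLL("libc.so.6") raises OSError on macOS or Windows, and malloc_trim is a glibc extension. A defensive variant (a sketch, not the committed code) that tightens the return types and degrades to a no-op on other platforms:

import ctypes

import psutil

def get_cpu_memory_usage() -> float:
    # Resident set size of the current process, in MB.
    return psutil.Process().memory_info().rss / 1024 / 1024

def trim_memory() -> int:
    try:
        libc = ctypes.CDLL("libc.so.6")
    except OSError:
        return 0  # non-glibc platform: nothing to trim
    # malloc_trim(0) returns 1 if any memory was released back to the OS.
    return int(libc.malloc_trim(0))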
