Skip to content

Commit 35d5861

Browse files
committed
Reduced memory usage when use_python_runtime=True by passing the built ICudaEngine directly via the new API instead of a serialized copy
1 parent 711446c commit 35d5861

File tree

3 files changed

+62
-35
lines changed

3 files changed

+62
-35
lines changed

py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py

Lines changed: 48 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ class UnsupportedOperatorException(RuntimeError):
6565

6666

6767
class TRTInterpreterResult(NamedTuple):
68-
serialized_engine: bytes
68+
engine: trt.ICudaEngine | bytes
6969
input_names: Sequence[str]
7070
output_names: Sequence[str]
7171
weight_name_map: Optional[dict[Any, Any]]
@@ -731,6 +731,10 @@ def run(
731731
if interpreter_result is not None: # hit the cache
732732
return interpreter_result # type: ignore[no-any-return]
733733

734+
import psutil
735+
736+
print(psutil.Process().memory_info().rss / 1024 / 1024, "MB")
737+
# breakpoint()
734738
self._construct_trt_network_def()
735739

736740
if not self.compilation_settings.immutable_weights:
@@ -749,41 +753,62 @@ def run(
749753
self._create_timing_cache(
750754
builder_config, self.compilation_settings.timing_cache_path
751755
)
752-
serialized_engine = self.builder.build_serialized_network(
756+
import psutil
757+
758+
print(psutil.Process().memory_info().rss / 1024 / 1024, "MB")
759+
# breakpoint()
760+
761+
cuda_engine = self.builder.build_engine_with_config(
753762
self.ctx.net, builder_config
754763
)
755-
assert serialized_engine
756764

757765
_LOGGER.info(
758766
f"Build TRT engine elapsed time: {datetime.now() - build_engine_start_time}"
759767
)
760-
_LOGGER.info(f"TRT Engine uses: {serialized_engine.nbytes} bytes of Memory")
761-
762768
self.ctx.clear_cpu_weights_reference_holder()
763769

764770
self._save_timing_cache(
765771
builder_config, self.compilation_settings.timing_cache_path
766772
)
767773

768774
# Engine caching only for refittable engines
769-
if (
770-
not self.compilation_settings.immutable_weights
771-
and self.compilation_settings.cache_built_engines
772-
and self.engine_cache is not None
773-
):
774-
self._insert_engine_to_cache(hash_val, serialized_engine)
775-
776-
with io.BytesIO() as engine_bytes:
777-
engine_bytes.write(serialized_engine)
778-
engine_str = engine_bytes.getvalue()
779-
780-
return TRTInterpreterResult(
781-
engine_str,
782-
self._input_names,
783-
self._output_names,
784-
self.weight_name_map,
785-
self.ctx.requires_output_allocator,
786-
)
775+
# if (
776+
# not self.compilation_settings.immutable_weights
777+
# and self.compilation_settings.cache_built_engines
778+
# and self.engine_cache is not None
779+
# ):
780+
# self._insert_engine_to_cache(hash_val, serialized_engine)
781+
782+
print("After build_engine_with_config")
783+
print(psutil.Process().memory_info().rss / 1024 / 1024, "MB")
784+
# breakpoint()
785+
assert cuda_engine
786+
if self.compilation_settings.use_python_runtime:
787+
return TRTInterpreterResult(
788+
cuda_engine,
789+
self._input_names,
790+
self._output_names,
791+
self.weight_name_map,
792+
self.ctx.requires_output_allocator,
793+
)
794+
else:
795+
print(psutil.Process().memory_info().rss / 1024 / 1024, "MB")
796+
# breakpoint()
797+
serialized_engine = cuda_engine.serialize()
798+
_LOGGER.info(f"TRT Engine uses: {serialized_engine.nbytes} bytes of Memory")
799+
800+
with io.BytesIO() as engine_bytes:
801+
engine_bytes.write(serialized_engine)
802+
engine_str = engine_bytes.getvalue()
803+
print(psutil.Process().memory_info().rss / 1024 / 1024, "MB")
804+
# breakpoint()
805+
return TRTInterpreterResult(
806+
engine_str,
807+
self._input_names,
808+
self._output_names,
809+
self.weight_name_map,
810+
self.ctx.requires_output_allocator,
811+
)
787812

788813
def run_node(self, n: torch.fx.Node) -> torch.fx.Node:
789814
self._cur_node_name = get_node_name(n)

py/torch_tensorrt/dynamo/conversion/_conversion.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -89,12 +89,18 @@ def convert_module(
8989
module, inputs, settings, engine_cache=engine_cache
9090
)
9191

92-
rt_cls = PythonTorchTensorRTModule
93-
9492
if ENABLED_FEATURES.torch_tensorrt_runtime and not settings.use_python_runtime:
9593
from torch_tensorrt.dynamo.runtime import TorchTensorRTModule
9694

97-
rt_cls = TorchTensorRTModule
95+
return TorchTensorRTModule(
96+
serialized_engine=interpreter_result.engine,
97+
input_binding_names=list(interpreter_result.input_names),
98+
output_binding_names=list(interpreter_result.output_names),
99+
name=name,
100+
settings=settings,
101+
weight_name_map=interpreter_result.weight_name_map,
102+
requires_output_allocator=interpreter_result.requires_output_allocator,
103+
)
98104

99105
elif (
100106
not ENABLED_FEATURES.torch_tensorrt_runtime and not settings.use_python_runtime
@@ -103,8 +109,8 @@ def convert_module(
103109
"Since Torch-TensorRT runtime is not available, using Python Runtime, some features may not be available"
104110
)
105111

106-
return rt_cls(
107-
serialized_engine=interpreter_result.serialized_engine,
112+
return PythonTorchTensorRTModule(
113+
cuda_engine=interpreter_result.engine,
108114
input_binding_names=list(interpreter_result.input_names),
109115
output_binding_names=list(interpreter_result.output_names),
110116
name=name,

py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@
1515
from torch_tensorrt.dynamo.debug._DebuggerConfig import DebuggerConfig
1616
from torch_tensorrt.dynamo.debug._supports_debugger import cls_supports_debugger
1717
from torch_tensorrt.dynamo.utils import DYNAMIC_DIM
18-
from torch_tensorrt.logging import TRT_LOGGER
1918
from torch_tensorrt.runtime._utils import (
2019
_is_switch_required,
2120
_select_rt_device,
@@ -123,7 +122,7 @@ class PythonTorchTensorRTModule(Module): # type: ignore[misc]
123122

124123
def __init__(
125124
self,
126-
serialized_engine: Optional[bytes] = None,
125+
cuda_engine: trt.ICudaEngine = None,
127126
input_binding_names: Optional[List[str]] = None,
128127
output_binding_names: Optional[List[str]] = None,
129128
*,
@@ -182,7 +181,7 @@ def __init__(
182181
# Unused currently - to be used by Dynamic Shape support implementation
183182
self.memory_pool = None
184183

185-
self.serialized_engine = serialized_engine
184+
self.engine = cuda_engine
186185
self.input_names = (
187186
input_binding_names if input_binding_names is not None else []
188187
)
@@ -204,7 +203,6 @@ def __init__(
204203
else False
205204
)
206205
self.settings = settings
207-
self.engine = None
208206
self.weight_name_map = weight_name_map
209207
self.target_platform = Platform.current_platform()
210208
self.runtime_states = TorchTRTRuntimeStates(
@@ -219,7 +217,7 @@ def __init__(
219217
self.output_allocator: Optional[DynamicOutputAllocator] = None
220218
self.use_output_allocator_outputs = False
221219

222-
if self.serialized_engine is not None and not self.settings.lazy_engine_init:
220+
if self.engine is not None and not self.settings.lazy_engine_init:
223221
self.setup_engine()
224222

225223
def get_streamable_device_memory_budget(self) -> Any:
@@ -265,8 +263,6 @@ def setup_engine(self) -> None:
265263
), f"TensorRT engine was not built to target current platform (target: {self.target_platform}, current: {Platform.current_platform()})"
266264

267265
self.initialized = True
268-
runtime = trt.Runtime(TRT_LOGGER)
269-
self.engine = runtime.deserialize_cuda_engine(self.serialized_engine)
270266
if self.settings.enable_weight_streaming:
271267
self.set_default_device_memory_budget()
272268
self.context = self.engine.create_execution_context()

0 commit comments

Comments (0)