Commit c77b057

Torch-TRT save through torch.export.save

1 parent 4130fb2 commit c77b057

14 files changed: +105 -33 lines

CHANGELOG.md

Lines changed: 1 addition & 0 deletions

@@ -27,6 +27,7 @@ limitations under the License.
 - new: GPU and Host memory usage logging
 - change: Install the TensorRT package for architectures other than x86_64
 - change: Disable conversion fallback for TensorRT paths and expose control option in custom config
+- change: Use torch.export.save for Torch-TRT model serialization
 - fix: Correctness command relative tolerance formula
 - fix: Memory management during export and conversion process for Torch

model_navigator/commands/convert/converters/ep2torchtrt.py

Lines changed: 61 additions & 16 deletions

@@ -14,19 +14,21 @@
 """Convert ExportedProgram model to Torch-TensorRT model."""

 import pathlib
-from typing import Any, Dict, Optional
+from typing import Any, Dict, List, Optional

 import fire
 import numpy as np
 import torch  # pytype: disable=import-error
 from loguru import logger
+from packaging.version import Version

 from model_navigator.configuration import TensorRTPrecision, TensorRTPrecisionMode
 from model_navigator.configuration.device import map_device_string
-from model_navigator.core.dataloader import load_samples
+from model_navigator.core.dataloader import expand_sample, load_samples
+from model_navigator.core.logger import LOGGER
 from model_navigator.core.tensor import TensorMetadata
 from model_navigator.frameworks.tensorrt import utils as tensorrt_utils
-from model_navigator.frameworks.tensorrt.timing_tactics import TimingCacheManager, trt_cache_inplace_cache_dir
+from model_navigator.frameworks.tensorrt.timing_tactics import TimingCacheManager
 from model_navigator.utils.common import numpy_to_torch_dtype


@@ -59,9 +61,10 @@ def convert(
     exported_model_path: str,
     converted_model_path: str,
     input_metadata: Dict[str, Any],
-    shapes: Dict[str, Dict[str, int]],
+    shapes: Dict[str, Dict[str, List[int]]],
     batch_dim: Optional[int],
     max_workspace_size: int,
+    pickle_protocol: int,
     precision: str,
     precision_mode: str,
     target_device: str,
@@ -82,12 +85,13 @@ def convert(
             and respective values.
         batch_dim: Batch dimension.
         max_workspace_size: Maximum workspace size in bytes.
+        pickle_protocol: Pickle protocol used during model serialization
         precision: TensorRT precision. Could be "fp16" or "fp32".
         precision_mode: TensorRT precision mode.
         target_device: Device on which perform the conversion
         debug: If True print debug logs.
         custom_args: Dictionary with passthrough parameters. For available arguments check PyTorch
-            documentation: https://pytorch.org/TensorRT/py_api/torch_tensorrt.html
+            documentation: https://pytorch.org/TensorRT/py_api/torch_tensorrt.html
         timing_cache_dir: Directory to save timing cache. Defaults to None which means it will be saved in workspace root.
         model_name: Model name for the timing cache. Defaults to None which means it will be named after the model file.
         navigator_workspace: Model Navigator workspace path. When None use current workdir. Defaults to None.
@@ -106,9 +110,34 @@

     conversion_sample = load_samples("conversion_samples", navigator_workspace, batch_dim)[0]

+    if batch_dim is None:
+        max_batch_size = None
+        expanded_sample = expand_sample(conversion_sample, input_metadata, batch_dim=batch_dim, batch_size=None)
+    else:
+        # WAR to make data dynamic
+        max_batch_size = list(shapes.values())[0]["max"][0]
+        batch_size = 2 if max_batch_size > 1 else 1  # select the minimum value to expand samples
+        expanded_sample = expand_sample(conversion_sample, input_metadata, batch_dim=batch_dim, batch_size=batch_size)
+
+    dummy_input = {n: torch.from_numpy(val).to(target_device) for n, val in expanded_sample.items()}
+    dummy_input = input_metadata.unflatten_sample(dummy_input, wrap_input=False)
+
+    if not isinstance(dummy_input, tuple):
+        dummy_input = (dummy_input,)
+    if not isinstance(dummy_input[-1], dict):
+        dummy_input = (*dummy_input, {})
+    *args, kwargs = dummy_input
+
     input_dtypes = [numpy_to_torch_dtype(np.dtype(input_dtype)) for input_dtype in input_dtypes]
     model_input_shapes = []
-    for input_shapes, input_dtype in zip(shapes.values(), input_dtypes):
+    dynamic_shapes = []
+    for input_name, input_dtype in zip(shapes.keys(), input_dtypes):
+        input_shapes = shapes.get(input_name)
+        tensor_metadata = input_metadata.get(input_name)
+        if not tensor_metadata or not input_shapes:
+            LOGGER.warning(f"Input metadata or input shapes for input {input_name} is not found")
+            continue
+
         model_input_shapes.append(
             torch_tensorrt.Input(
                 min_shape=input_shapes["min"],
@@ -118,6 +147,18 @@
             )
         )

+        dynamic_shape_map = {}
+        if max_batch_size is not None and max_batch_size > 1 and len(tensor_metadata.shape) > 0:
+            dynamic_shape_map[0] = torch.export.Dim(f"{input_name}_batch", min=1, max=max_batch_size)
+
+        for idx in range(1, len(input_shapes["min"])):
+            min_value = input_shapes["min"][idx]
+            max_value = input_shapes["max"][idx]
+            if min_value != max_value:
+                dynamic_shape_map[idx] = torch.export.Dim(f"{input_name}__{idx}", min=min_value, max=max_value)
+
+        dynamic_shapes.append(dynamic_shape_map)
+
     exported_model_path = pathlib.Path(exported_model_path)
     if not exported_model_path.is_absolute():
         exported_model_path = navigator_workspace / exported_model_path
@@ -134,19 +175,14 @@

     target_device = map_device_string(target_device)

-    # saving timing cache in model_navigator workspace or ...
-    timing_cache = trt_cache_inplace_cache_dir()
-    if timing_cache_dir is not None:
-        timing_cache = pathlib.Path(timing_cache_dir)
-
     with TimingCacheManager(model_name=model_name, cache_path=timing_cache_dir) as timing_cache:
         timing_cache_path = timing_cache.as_posix() if timing_cache else None

         # reusing custom_args as dynamo.compile has a default cache path argument
         if timing_cache_path is not None:
             custom_args["timing_cache_path"] = timing_cache_path

-        tr_model_compiled = torch_tensorrt.dynamo.compile(
+        trt_model_compiled = torch_tensorrt.dynamo.compile(
             exported_program=model,
             inputs=model_input_shapes,
             workspace_size=max_workspace_size,
@@ -155,15 +191,24 @@
             **custom_args,
         )

+        exported_model = torch.export.export(
+            trt_model_compiled,
+            args=tuple(args),
+            kwargs=kwargs,
+            dynamic_shapes=dynamic_shapes,
+            strict=False,
+        )
+
         converted_model_path = pathlib.Path(converted_model_path)
         if not converted_model_path.is_absolute():
             converted_model_path = navigator_workspace / converted_model_path

-        inputs = []
-        for _, val in conversion_sample.items():
-            inputs.append(torch.from_numpy(val).to(target_device))
+        save_kwargs = {}
+        if Version(torch.__version__) > Version("2.6"):
+            LOGGER.info("Using pickle protocol {}.", pickle_protocol)
+            save_kwargs["pickle_protocol"] = pickle_protocol

-        torch_tensorrt.save(tr_model_compiled, converted_model_path.as_posix(), inputs=inputs)
+        torch.export.save(exported_model, converted_model_path.as_posix(), **save_kwargs)


 if __name__ == "__main__":
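
With this change the converted artifact is a standard torch.export archive rather than a torch_tensorrt.save output, so it can be read back with torch.export.load. A minimal load-side sketch; the path and input shape are illustrative, and torch_tensorrt must be importable so the embedded TensorRT engine ops can execute:

    import torch
    import torch_tensorrt  # noqa: F401  # registers the TensorRT runtime ops used by the exported graph

    # "model.ep" stands in for the converted_model_path written by convert().
    exported_program = torch.export.load("model.ep")
    module = exported_program.module()  # callable wrapper around the exported graph

    sample = torch.randn(2, 3, 224, 224, device="cuda")  # a batch of 2 stays inside the exported Dim range
    output = module(sample)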

model_navigator/commands/convert/torch.py

Lines changed: 3 additions & 0 deletions

@@ -114,6 +114,7 @@ def _run(
         precision: TensorRTPrecision,
         precision_mode: TensorRTPrecisionMode,
         max_workspace_size: int,
+        pickle_protocol: int,
         verbose: bool,
         debug: bool,
         dataloader_trt_profile: TensorRTProfile,
@@ -138,6 +139,7 @@ def _run(
             precision: TensorRTPrecision.
             precision_mode: TensorRT precision mode.
             max_workspace_size: TensorRT maximum workspace size.
+            pickle_protocol: Pickle protocol for model serialization.
             verbose: If True verbose logging.
             debug: If True print debug logs.
             dataloader_trt_profile: Dataloader TensorRT profile.
@@ -183,6 +185,7 @@ def get_args(max_batch_size=None):
             "max_workspace_size": max_workspace_size,
             "precision": precision.value,
             "precision_mode": precision_mode.value,
+            "pickle_protocol": pickle_protocol,
             "navigator_workspace": workspace.path.as_posix(),
             "target_device": target_device.value,
             "custom_args": custom_args,

model_navigator/commands/export/exporters/torch2dynamo_onnx.py

Lines changed: 4 additions & 4 deletions

@@ -124,15 +124,15 @@ def expand_batch_dim(tensor, batch_dim, max_batch_size):
         if not tensor_metadata:
             continue

-        dynamic_shapes_ = {}
+        dynamic_shape_map = {}
         if max_batch_size is not None and max_batch_size > 1 and len(tensor_metadata.shape) > 0:
-            dynamic_shapes_[0] = torch.export.Dim("batch", min=1, max=max_batch_size)
+            dynamic_shape_map[0] = torch.export.Dim("batch", min=1, max=max_batch_size)

         for idx in range(1, len(spec_.min)):
             if spec_.min[idx] != spec_.max[idx]:
-                dynamic_shapes_[idx] = torch.export.Dim(f"{name}__{idx}", min=spec_.min[idx], max=spec_.max[idx])
+                dynamic_shape_map[idx] = torch.export.Dim(f"{name}__{idx}", min=spec_.min[idx], max=spec_.max[idx])

-        dynamic_shapes.append(dynamic_shapes_)
+        dynamic_shapes.append(dynamic_shape_map)

     try:
         exported_model = torch.onnx.export(
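
Beyond the rename (dynamic_shapes_ to dynamic_shape_map), this hunk shows the per-input dict format the dynamo ONNX path consumes. A hedged sketch of the same pattern in isolation, assuming a PyTorch version (2.5+) where torch.onnx.export accepts dynamo=True together with dynamic_shapes; the model, shapes, and file name are illustrative:

    import torch


    class TinyModel(torch.nn.Module):
        def forward(self, x):
            return x * 2.0


    # One dict per positional input, mapping dim index -> Dim, like dynamic_shape_map above.
    batch = torch.export.Dim("batch", min=1, max=8)
    onnx_program = torch.onnx.export(
        TinyModel(),
        (torch.randn(2, 4),),
        dynamo=True,
        dynamic_shapes=[{0: batch}],
    )
    onnx_program.save("tiny_model.onnx")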

model_navigator/commands/export/exporters/torch2exportedprogram.py

Lines changed: 5 additions & 7 deletions

@@ -77,10 +77,8 @@ def export(
         max_batch_size = None
         expanded_sample = expand_sample(conversion_sample, input_metadata, batch_dim=batch_dim, batch_size=None)
     else:
-        # WAR for to big batch size value
-        max_batch_size = max_batch_size if max_batch_size < 2048 else max_batch_size - 1
         # WAR to make data dynamic
-        batch_size = min(2, max_batch_size)  # select the minimum value to expand samples
+        batch_size = 2 if max_batch_size > 1 else 1  # select the minimum value to expand samples
         expanded_sample = expand_sample(conversion_sample, input_metadata, batch_dim=batch_dim, batch_size=batch_size)

     dummy_input = {n: torch.from_numpy(val).to(target_device) for n, val in expanded_sample.items()}
@@ -103,15 +101,15 @@ def export(
         if not tensor_metadata:
             continue

-        dynamic_shapes_ = {}
+        dynamic_shape_map = {}
         if max_batch_size is not None and max_batch_size > 1 and len(tensor_metadata.shape) > 0:
-            dynamic_shapes_[0] = torch.export.Dim(f"{name}_batch", min=1, max=max_batch_size)
+            dynamic_shape_map[0] = torch.export.Dim(f"{name}_batch", min=1, max=max_batch_size)

         for idx in range(1, len(spec_.min)):
             if spec_.min[idx] != spec_.max[idx]:
-                dynamic_shapes_[idx] = torch.export.Dim(f"{name}__{idx}", min=spec_.min[idx], max=spec_.max[idx])
+                dynamic_shape_map[idx] = torch.export.Dim(f"{name}__{idx}", min=spec_.min[idx], max=spec_.max[idx])

-        dynamic_shapes.append(dynamic_shapes_)
+        dynamic_shapes.append(dynamic_shape_map)

     try:
         exported_model = torch.export.export(
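
The `batch_size = 2 if max_batch_size > 1 else 1` workaround replaces the earlier `min(2, max_batch_size)` logic but serves the same purpose: torch.export specializes dimensions it observes as 0 or 1 into constants, so tracing with a batch of 2 keeps the batch dimension symbolic and lets the torch.export.Dim range apply. A minimal sketch of the pattern with an illustrative model and range:

    import torch


    class Projection(torch.nn.Module):
        def forward(self, x):
            return torch.nn.functional.relu(x)


    # Trace with batch size 2, not 1: a size-1 dim would be specialized to a constant.
    sample = torch.randn(2, 16)
    dynamic_shapes = [{0: torch.export.Dim("x_batch", min=1, max=8)}]

    exported = torch.export.export(Projection(), (sample,), dynamic_shapes=dynamic_shapes, strict=False)
    module = exported.module()  # accepts any batch size in [1, 8]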

model_navigator/commands/export/exporters/torch2torchscript.py

Lines changed: 3 additions & 0 deletions

@@ -24,6 +24,7 @@
 from model_navigator.core.dataloader import load_samples
 from model_navigator.core.tensor import TensorMetadata
 from model_navigator.exceptions import ModelNavigatorUserInputError
+from model_navigator.frameworks.torch.utils import offload_torch_model_to_cpu
 from model_navigator.utils.common import numpy_to_torch_dtype


@@ -122,6 +123,8 @@ def export(

     torch.jit.save(script_module, exported_model_path.as_posix())

+    offload_torch_model_to_cpu(script_module)
+

 if __name__ == "__main__":
     fire.Fire(export)
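
The new offload_torch_model_to_cpu call releases GPU memory held by the scripted module once it has been written to disk. The helper's body is not part of this diff; a plausible sketch, under the assumption that it only needs to move parameters off-device and flush the CUDA caching allocator:

    import torch


    def offload_torch_model_to_cpu(model: torch.nn.Module) -> None:
        # Hypothetical reimplementation; the real helper lives in
        # model_navigator.frameworks.torch.utils and may differ.
        model.to("cpu")
        if torch.cuda.is_available():
            torch.cuda.empty_cache()  # return cached blocks to the driver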

model_navigator/configuration/__init__.py

Lines changed: 2 additions & 0 deletions

@@ -49,6 +49,7 @@
     DEFAULT_MIN_SEGMENT_SIZE,
     DEFAULT_MIN_TRIALS,
     DEFAULT_ONNX_OPSET,
+    DEFAULT_PICKLE_PROTOCOL_TORCHTRT,
     DEFAULT_STABILITY_PERCENTAGE,
     DEFAULT_STABILIZATION_WINDOWS,
     DEFAULT_THROUGHPUT_BACKOFF_LIMIT,
@@ -844,6 +845,7 @@ class TorchTensorRTConfig(CustomConfigForTensorRT):
     """Torch custom config used for TensorRT TorchScript conversion."""

     max_workspace_size: Optional[int] = DEFAULT_MAX_WORKSPACE_SIZE_TORCHTRT
+    pickle_protocol: int = DEFAULT_PICKLE_PROTOCOL_TORCHTRT

     @property
     def format(self) -> Format:
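
With the new field, the serialization protocol becomes user-configurable. A sketch of how it could be set from the public API, assuming the usual nav.torch.optimize entry point and the top-level TorchTensorRTConfig re-export; the model, dataloader, and protocol value are illustrative:

    import torch
    import model_navigator as nav

    model = torch.nn.Linear(16, 8).eval()
    dataloader = [torch.randn(2, 16) for _ in range(8)]

    package = nav.torch.optimize(
        model=model,
        dataloader=dataloader,
        # Override the new default (5), e.g. for tools that cannot read protocol-5 pickles.
        custom_configs=[nav.TorchTensorRTConfig(pickle_protocol=4)],
    )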

model_navigator/configuration/constants.py

Lines changed: 1 addition & 0 deletions

@@ -37,6 +37,7 @@
 DEFAULT_MAX_WORKSPACE_SIZE_TORCHTRT = (
     0  # Default to use full device memory https://pytorch.org/TensorRT/py_api/dynamo.html
 )
+DEFAULT_PICKLE_PROTOCOL_TORCHTRT = 5
 DEFAULT_MIN_SEGMENT_SIZE = 3
 DEFAULT_TENSORRT_MAX_DIMENSION_SIZE = 2**31 - 1
 OPT_MAX_SHAPE_RATIO = 4 / 5
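
Protocol 5 (PEP 574) is a reasonable default for large artifacts: it adds out-of-band buffer support, which avoids copying big binary payloads (such as serialized engine blobs) into the pickle stream, and it is available on every Python version current PyTorch supports. A small standalone illustration of the out-of-band mechanism, with a stand-in payload:

    import pickle

    engine_blob = bytearray(64 * 1024 * 1024)  # stand-in for a large serialized engine

    buffers = []
    data = pickle.dumps(pickle.PickleBuffer(engine_blob), protocol=5, buffer_callback=buffers.append)
    restored = pickle.loads(data, buffers=buffers)  # rebuilt without copying the payload into `data`

Note that, per the version gate in ep2torchtrt.py above, the value is only forwarded to torch.export.save on torch versions newer than 2.6.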

model_navigator/configuration/model/model_config.py

Lines changed: 4 additions & 0 deletions

@@ -609,6 +609,7 @@ def __init__(
         precision: TensorRTPrecision,
         precision_mode: TensorRTPrecisionMode,
         max_workspace_size: int,
+        pickle_protocol: int,
         trt_profiles: Optional[List[TensorRTProfile]] = None,
         parent: Optional[ModelConfig] = None,
         custom_args: Optional[Dict[str, Any]] = None,
@@ -622,6 +623,7 @@ def __init__(
             precision: TensorRT model precision
             precision_mode: Mode how the precision flags are combined
             max_workspace_size: The maximum GPU memory the model can use temporarily during execution
+            pickle_protocol: Pickle protocol used during model serialization
             trt_profiles: TensorRT profiles
             custom_args: Custom arguments passed to Torch TensorRT conversion
             device: runtime device e.g. "cuda:0"
@@ -631,6 +633,7 @@ def __init__(
         self.precision = precision
         self.precision_mode = precision_mode
         self.max_workspace_size = max_workspace_size
+        self.pickle_protocol = pickle_protocol
        self.trt_profiles = trt_profiles
        self.custom_args = custom_args
        self.runner_config = DeviceRunnerConfig(device=device)
@@ -648,6 +651,7 @@ def _from_dict(cls, data_dict: Dict):
             precision=cls._parse_string(TensorRTPrecision, data_dict.get("precision")),
             precision_mode=cls._parse_string(TensorRTPrecisionMode, data_dict.get("precision_mode")),
             max_workspace_size=cls._parse_string(int, data_dict.get("max_workspace_size")),
+            pickle_protocol=cls._parse_string(int, data_dict.get("pickle_protocol")),
             trt_profiles=trt_profiles,
             device=data_dict.get("device"),
             conversion_fallback=data_dict.get("conversion_fallback", False),

model_navigator/configuration/model/model_config_builder.py

Lines changed: 1 addition & 0 deletions

@@ -308,6 +308,7 @@ def get_torch_trt_config(
         precision=precision,
         precision_mode=torch_trt_config.precision_mode,
         max_workspace_size=torch_trt_config.max_workspace_size,
+        pickle_protocol=torch_trt_config.pickle_protocol,
         trt_profiles=torch_trt_config.trt_profiles,
         custom_args=torch_trt_config.custom_args,
         device=torch_trt_config.device,
