Commit 9a03e29

Moved model transfers between devices inside the export scripts. Cleaned up some operations.
1 parent c77b057 commit 9a03e29

File tree

10 files changed, +242 −132 lines changed

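The common thread across the diffs below: each export script now claims the target device itself and releases it when done, instead of leaving the model wherever the caller put it. A condensed sketch of that shape, assuming placeholder get_model and export_fn callables (not model_navigator APIs):

import gc

import torch


def run_export(get_model, export_fn, target_device="cuda"):
    """Sketch of the pattern: the script owns the device move and cleanup."""
    model = get_model()
    model.to(target_device)  # claim the device inside the script
    try:
        export_fn(model)
    finally:
        # Runs on success and failure alike: return the weights to CPU and
        # ask the caching allocator to hand unused blocks back to the driver.
        model.to("cpu")
        gc.collect()
        torch.cuda.empty_cache()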

model_navigator/commands/data_dump/samples.py

Lines changed: 1 addition & 0 deletions

@@ -216,6 +216,7 @@ def _run(
         samples_to_npz(outputs, sample_path, batch_dim, raise_on_error=raise_on_error, num_samples=len(samples))

         runner.deactivate()
+        offload_model_to_cpu(model, framework)

         return CommandOutput(
             status=CommandStatus.OK,
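The new offload_model_to_cpu(model, framework) call frees the device as soon as sample dumping finishes. The helper's body is not part of this commit; a plausible minimal sketch of what a framework-aware offload does (the framework handling here is an assumption, not the real implementation):

import gc

import torch


def offload_model_to_cpu(model, framework):
    """Hypothetical sketch; the real model_navigator helper may differ."""
    # Only Torch modules hold CUDA allocations that can be moved this way;
    # the framework argument would let other backends opt out.
    if isinstance(model, torch.nn.Module):
        model.to("cpu")
        gc.collect()
        torch.cuda.empty_cache()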

model_navigator/commands/export/exporters/torch2dynamo_onnx.py

Lines changed: 13 additions & 0 deletions

@@ -13,6 +13,7 @@
 # limitations under the License.
 """Export Torch model using dynamo."""

+import gc
 import logging
 import pathlib
 from typing import Any, Dict, List, Optional
@@ -151,6 +152,18 @@ def expand_batch_dim(tensor, batch_dim, max_batch_size):
         exported_model.save(exported_model_path.as_posix())
     finally:
         root_logger.setLevel(original_loglevel)
+        # Offload tensors to CPU
+        for arg in args:
+            if isinstance(arg, torch.Tensor):
+                arg.cpu()
+        for value in kwargs.values():
+            if isinstance(value, torch.Tensor):
+                value.cpu()
+
+        del args
+        del kwargs
+        gc.collect()
+        torch.cuda.empty_cache()

     _modify_onnx_io_names(exported_model_path, input_names, output_names, exported_model_path)
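A subtlety in the cleanup above: Tensor.cpu() returns a CPU copy rather than moving the tensor in place, so the discarded return values free nothing by themselves. The GPU memory is reclaimed because del args / del kwargs drop the last references to the CUDA tensors, after which gc.collect() plus torch.cuda.empty_cache() return the cached blocks to the driver. A minimal demonstration, assuming a CUDA device is available:

import gc

import torch

args = [torch.randn(1024, 1024, device="cuda")]

for arg in args:
    if isinstance(arg, torch.Tensor):
        arg.cpu()  # returns a copy; the CUDA tensor is still alive

print(torch.cuda.memory_allocated())  # ~4 MB still held through args[0]

del args  # drop the last reference to the CUDA tensor
gc.collect()
torch.cuda.empty_cache()  # release cached blocks back to the driver

print(torch.cuda.memory_allocated())  # 0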
model_navigator/commands/export/exporters/torch2exportedprogram.py

Lines changed: 42 additions & 24 deletions

@@ -13,6 +13,7 @@
 # limitations under the License.
 """Export Torch model using dynamo."""

+import gc
 import pathlib
 from typing import Any, Dict, Optional

@@ -59,6 +60,7 @@ def export(
         device_max_batch_size: Maximum batch size that fits on the device. Defaults to None.
     """
     model = get_model()
+    model.to(target_device)

     if not navigator_workspace:
         navigator_workspace = pathlib.Path.cwd()
@@ -112,27 +114,43 @@
         dynamic_shapes.append(dynamic_shape_map)

     try:
-        exported_model = torch.export.export(
-            model,
-            args=tuple(args),
-            kwargs=kwargs,
-            dynamic_shapes=dynamic_shapes,
-            **custom_args,
-        )
-    except Exception:
-        exported_model = torch.export._trace._export(
-            model,
-            args=tuple(args),
-            _allow_complex_guards_as_runtime_asserts=True,
-            dynamic_shapes=dynamic_shapes,
-            kwargs=kwargs,
-            **custom_args,
-        )
-
-    exported_model_path = pathlib.Path(exported_model_path)
-    if not exported_model_path.is_absolute():
-        exported_model_path = navigator_workspace / exported_model_path
-
-    torch.export.save(exported_model, exported_model_path.as_posix())
-
-    offload_torch_model_to_cpu(exported_model.module())
+        try:
+            exported_model = torch.export.export(
+                model,
+                args=tuple(args),
+                kwargs=kwargs,
+                dynamic_shapes=dynamic_shapes,
+                **custom_args,
+            )
+        except Exception:
+            exported_model = torch.export._trace._export(
+                model,
+                args=tuple(args),
+                _allow_complex_guards_as_runtime_asserts=True,
+                dynamic_shapes=dynamic_shapes,
+                kwargs=kwargs,
+                **custom_args,
+            )
+
+        exported_model_path = pathlib.Path(exported_model_path)
+        if not exported_model_path.is_absolute():
+            exported_model_path = navigator_workspace / exported_model_path
+
+        torch.export.save(exported_model, exported_model_path.as_posix())
+    finally:
+        if exported_model is not None:
+            offload_torch_model_to_cpu(exported_model.module())
+            del exported_model
+
+        # Offload tensors to CPU
+        for arg in args:
+            if isinstance(arg, torch.Tensor):
+                arg.cpu()
+        for value in kwargs.values():
+            if isinstance(value, torch.Tensor):
+                value.cpu()
+
+        del args
+        del kwargs
+        gc.collect()
+        torch.cuda.empty_cache()
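The finally block guards on exported_model is not None, which only works if the variable is bound to None before the try; that initialization has to live in unchanged context outside this hunk. A condensed sketch of the sentinel shape, with placeholder export_fn/save_fn/offload_fn callables standing in for the real calls:

def export_with_cleanup(export_fn, save_fn, offload_fn):
    exported_model = None  # sentinel, in case export_fn raises
    try:
        exported_model = export_fn()
        save_fn(exported_model)
    finally:
        # Runs whether or not the export raised; only offload a model
        # that was actually created.
        if exported_model is not None:
            offload_fn(exported_model)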

model_navigator/commands/export/exporters/torch2onnx.py

Lines changed: 32 additions & 18 deletions

@@ -13,6 +13,7 @@
 # limitations under the License.
 """Export Torch model to ONNX model."""

+import gc
 import inspect
 import pathlib
 from typing import Any, Dict, List, Mapping, Optional
@@ -72,54 +73,67 @@ def export(
     profiling_sample = load_samples("profiling_sample", navigator_workspace, batch_dim)[0]
     input_metadata = TensorMetadata.from_json(input_metadata)

-    dummy_input = {n: torch.from_numpy(val).to(export_device) for n, val in profiling_sample.items()}
+    dummy_input_map = {n: torch.from_numpy(val).to(export_device) for n, val in profiling_sample.items()}

     # adjust input dtypes to match input_metadata
     # TODO: Remove when torch.bfloat16 will be supported
+    dummy_input = {}
     for n, spec in input_metadata.items():
         if not isinstance(spec.dtype, torch.dtype):
             torch_dtype = numpy_to_torch_dtype(spec.dtype)
         else:
             torch_dtype = spec.dtype
-        dummy_input[n] = dummy_input[n].to(torch_dtype)
+        dummy_input[n] = dummy_input_map[n].to(torch_dtype)

     dummy_input = input_metadata.unflatten_sample(dummy_input)

     # torch.onnx.export requires inputs to be a tuple or tensor
     if isinstance(dummy_input, Mapping):
         dummy_input = (dummy_input,)

-    forward_argspec = inspect.getfullargspec(model.forward)
-    forward_args = forward_argspec.args[1:]
-
     args_mapping, kwargs_mapping = input_metadata.pytree_metadata.get_names_mapping()

+    # Use inspect.signature instead of getfullargspec for more complete parameter information
+    forward_signature = inspect.signature(model.forward)
+    forward_params = list(forward_signature.parameters.keys())
+
+    args_count = len(args_mapping)
+    forward_kwargs = forward_params[args_count:]
+
     for argname in kwargs_mapping:
-        assert argname in forward_args, f"Argument {argname} is not in forward argspec."
+        assert argname in forward_kwargs, f"Argument {argname} is not in forward argspec."

     input_names = []
     for args_names in args_mapping:
         input_names.extend(args_names)

-    for argname in forward_args:
+    for argname in forward_kwargs:
         if argname in kwargs_mapping:
             input_names.extend(kwargs_mapping[argname])

     exported_model_path = pathlib.Path(exported_model_path)
     if not exported_model_path.is_absolute():
         exported_model_path = navigator_workspace / exported_model_path

-    torch.onnx.export(
-        model,
-        args=dummy_input,
-        f=exported_model_path.as_posix(),
-        verbose=False,
-        opset_version=opset,
-        input_names=input_names,
-        output_names=output_names,
-        dynamic_axes=dynamic_axes,
-        **custom_args,
-    )
+    try:
+        torch.onnx.export(
+            model,
+            args=dummy_input,
+            f=exported_model_path.as_posix(),
+            verbose=False,
+            opset_version=opset,
+            input_names=input_names,
+            output_names=output_names,
+            dynamic_axes=dynamic_axes,
+            **custom_args,
+        )
+    finally:
+        for tensor in dummy_input_map.values():
+            tensor.cpu()
+
+        del dummy_input_map
+        gc.collect()
+        torch.cuda.empty_cache()


 if __name__ == "__main__":
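On the getfullargspec → inspect.signature switch: for a bound method, getfullargspec reports the underlying function's parameters with self included, which the old code had to slice off by hand; signature resolves the bound method directly. The diff then assumes the first len(args_mapping) parameters are positional and treats the remainder as keyword candidates (forward_params[args_count:]). A small self-contained demonstration (the Model class is illustrative only, not from model_navigator):

import inspect


class Model:
    def forward(self, input_ids, attention_mask=None):
        return input_ids


m = Model()

# getfullargspec exposes the raw function, 'self' included -- hence the
# old `args[1:]` slice.
print(inspect.getfullargspec(m.forward).args)
# ['self', 'input_ids', 'attention_mask']

# signature on the bound method drops 'self' automatically.
print(list(inspect.signature(m.forward).parameters))
# ['input_ids', 'attention_mask']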

model_navigator/commands/export/exporters/torch2quantized_onnx.py

Lines changed: 60 additions & 37 deletions

@@ -13,6 +13,7 @@
 # limitations under the License.
 """Export PyTorch model to quantized ONNX using ModelOpt."""

+import gc
 import inspect
 import pathlib
 from copy import deepcopy
@@ -109,57 +110,30 @@ def export(
     offload_torch_model_to_cpu(original_model)

     try:
-        # Move model copy to target device
-        model_copy = model_copy.to(target_device)
-
         # Load calibration samples
         LOGGER.info("Loading calibration samples")
         correctness_samples = load_samples("correctness_samples", navigator_workspace, batch_dim)
         if not correctness_samples:
             LOGGER.error("No correctness samples found for calibration")
             raise RuntimeError("No calibration samples found")

-        # Convert samples to PyTorch tensors
-        torch_samples = []
-        for sample in correctness_samples:
-            sample_dict = {}
-            for name, tensor in sample.items():
-                torch_sample = torch.from_numpy(tensor)
-                torch_sample = torch_sample.to(target_device)
-                sample_dict[name] = torch_sample
-            torch_samples.append(sample_dict)
-
-        calibration_data = [list(sample.values()) for sample in torch_samples]
-
-        # Define calibration function
-        def forward_loop(model):
-            for sample in calibration_data:
-                model(*sample)
-
-        LOGGER.info("Using NVFP4_FP8_MHA_CONFIG quantization config for precision NVFP4")
-
-        # Run quantization
-        LOGGER.info("Starting model quantization (this may take several minutes)...")
-        quantized_model = mtq.quantize(
-            model_copy,
-            NVFP4_FP8_MHA_CONFIG,
-            forward_loop,
-        )
+        quantized_model = _quantize_model(model_copy, target_device, correctness_samples)

         LOGGER.info("Model quantization completed")

         # Prepare input for ONNX export
         input_metadata = TensorMetadata.from_json(input_metadata)
         correct_sample = correctness_samples[0]
-        dummy_input = {n: torch.from_numpy(val).to(target_device) for n, val in correct_sample.items()}
+        dummy_input_map = {n: torch.from_numpy(val).to(target_device) for n, val in correct_sample.items()}

         # Adjust input dtypes to match input_metadata
+        dummy_input = {}
         for n, spec in input_metadata.items():
             if not isinstance(spec.dtype, torch.dtype):
                 torch_dtype = numpy_to_torch_dtype(spec.dtype)
             else:
                 torch_dtype = spec.dtype
-            dummy_input[n] = dummy_input[n].to(torch_dtype).to(target_device)
+            dummy_input[n] = dummy_input_map[n].to(torch_dtype).to(target_device)

         dummy_input = input_metadata.unflatten_sample(dummy_input)

@@ -168,17 +142,20 @@ def forward_loop(model):
         dummy_input = (dummy_input,)

         # Get expected function signature for forward method
-        forward_argspec = inspect.getfullargspec(model_copy.forward)
-        forward_args = forward_argspec.args[1:]  # Skip 'self'
+        forward_signature = inspect.signature(model_copy.forward)
+        forward_params = list(forward_signature.parameters.keys())

         # Create input_names for ONNX model
         args_mapping, kwargs_mapping = input_metadata.pytree_metadata.get_names_mapping()

+        args_count = len(args_mapping)
+        forward_kwargs = forward_params[args_count:]
+
         input_names = []
         for args_names in args_mapping:
             input_names.extend(args_names)

-        for argname in forward_args:
+        for argname in forward_kwargs:
             if argname in kwargs_mapping:
                 input_names.extend(kwargs_mapping[argname])
         # Configure quantizers for ONNX export
@@ -226,12 +203,58 @@ def forward_loop(model):

         LOGGER.info("Quantized ONNX export completed successfully")

-        # Clean up
-        offload_torch_model_to_cpu(model_copy)
-        offload_torch_model_to_cpu(quantized_model)
     except Exception as e:
         LOGGER.error(f"Error during quantized ONNX export: {str(e)}")
         raise
+    finally:
+        # Clean up
+        if model_copy is not None:
+            offload_torch_model_to_cpu(model_copy)
+            del model_copy
+        if quantized_model is not None:
+            offload_torch_model_to_cpu(quantized_model)
+            del quantized_model
+        if dummy_input_map is not None:
+            for tensor in dummy_input_map.values():
+                tensor.cpu()
+            del dummy_input_map
+
+        gc.collect()
+        torch.cuda.empty_cache()
+
+
+def _quantize_model(model, target_device, correctness_samples):
+    # Move model copy to target device
+    model = model.to(target_device)
+
+    # Convert samples to PyTorch tensors
+    torch_samples = []
+    for sample in correctness_samples:
+        sample_dict = {}
+        for name, tensor in sample.items():
+            torch_sample = torch.from_numpy(tensor)
+            torch_sample = torch_sample.to(target_device)
+            sample_dict[name] = torch_sample
+        torch_samples.append(sample_dict)
+
+    calibration_data = [list(sample.values()) for sample in torch_samples]
+
+    # Define calibration function
+    def forward_loop(model):
+        for sample in calibration_data:
+            model(*sample)
+
+    LOGGER.info("Using NVFP4_FP8_MHA_CONFIG quantization config for precision NVFP4")
+
+    # Run quantization
+    LOGGER.info("Starting model quantization (this may take several minutes)...")
+    quantized_model = mtq.quantize(
+        model,
+        NVFP4_FP8_MHA_CONFIG,
+        forward_loop,
+    )
+
+    return quantized_model


 if __name__ == "__main__":
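As in the exported-program exporter, the is not None guards in the finally block imply that model_copy, quantized_model, and dummy_input_map are bound to None before the try, outside this hunk. Beyond readability, extracting _quantize_model also has a memory effect: torch_samples, calibration_data, and the forward_loop closure become locals of the helper, so the calibration tensors lose their references as soon as it returns and the gc.collect() / torch.cuda.empty_cache() in the caller's finally block can reclaim them. A stripped-down sketch of that idea, with quantize_fn standing in for mtq.quantize bound to the fixed config:

import gc

import torch


def _quantize_model(model, target_device, samples, quantize_fn):
    model = model.to(target_device)

    # Calibration tensors live only inside this frame.
    calibration_data = [
        [torch.from_numpy(arr).to(target_device) for arr in sample.values()]
        for sample in samples
    ]

    def forward_loop(m):
        for sample in calibration_data:
            m(*sample)

    # Once this returns, calibration_data and forward_loop go out of scope,
    # so the caller's gc.collect()/torch.cuda.empty_cache() can free them.
    return quantize_fn(model, forward_loop)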
