From 97273d44372f6388f3c9d7497f1debd6c669d32a Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Tue, 1 Apr 2025 01:04:15 -0700 Subject: [PATCH 01/13] fix: Change the translational layer from numpy to torch during conversion to handle additional data types (#3445) --- py/torch_tensorrt/dynamo/_refit.py | 122 +++++++++--------- .../dynamo/conversion/_TRTInterpreter.py | 40 +++--- .../dynamo/conversion/converter_utils.py | 103 +++++++++++++-- .../dynamo/conversion/impl/conv.py | 14 +- .../dynamo/conversion/impl/deconv.py | 14 +- .../dynamo/conversion/impl/quantize.py | 82 ++++++------ tests/py/dynamo/models/test_models.py | 91 +++++++++++++ tests/py/dynamo/models/test_models_export.py | 5 +- 8 files changed, 325 insertions(+), 146 deletions(-) diff --git a/py/torch_tensorrt/dynamo/_refit.py b/py/torch_tensorrt/dynamo/_refit.py index 96fc6daad2..c128e9cc82 100644 --- a/py/torch_tensorrt/dynamo/_refit.py +++ b/py/torch_tensorrt/dynamo/_refit.py @@ -9,6 +9,7 @@ import tensorrt as trt import torch from torch.export import ExportedProgram +from torch.fx.experimental.proxy_tensor import unset_fake_temporarily from torch_tensorrt._enums import dtype from torch_tensorrt._Input import Input from torch_tensorrt.dynamo import partitioning @@ -144,71 +145,72 @@ def _refit_single_trt_engine_with_gm( Refit a TensorRT Engine in place """ - refitted = set() - torch_device = get_model_device(new_gm) - refitter = trt.Refitter(old_engine, TRT_LOGGER) - weight_list = refitter.get_all_weights() - - if weight_name_map: - # Get the refitting mapping - trt_wt_location = ( - trt.TensorLocation.DEVICE - if torch_device.type == "cuda" - else trt.TensorLocation.HOST - ) + with unset_fake_temporarily(): + refitted = set() + torch_device = get_model_device(new_gm) + refitter = trt.Refitter(old_engine, TRT_LOGGER) + weight_list = refitter.get_all_weights() + + if weight_name_map: + # Get the refitting mapping + trt_wt_location = ( + trt.TensorLocation.DEVICE + if torch_device.type == "cuda" + else trt.TensorLocation.HOST + ) - constant_mapping: dict[str, Any] = weight_name_map.pop( - "constant_mapping", {} - ) # type: ignore - mapping = construct_refit_mapping_from_weight_name_map( - weight_name_map, new_gm.state_dict() - ) - constant_mapping_with_type = {} - - for constant_name, val in constant_mapping.items(): - np_weight_type = val.dtype - val_tensor = torch.from_numpy(val).cuda() - trt_dtype = dtype.try_from(np_weight_type).to(trt.DataType) - torch_dtype = dtype.try_from(np_weight_type).to(torch.dtype) - constant_mapping_with_type[constant_name] = ( - val_tensor.clone().reshape(-1).contiguous().to(torch_dtype), - trt_dtype, + constant_mapping: dict[str, Any] = weight_name_map.pop( + "constant_mapping", {} + ) # type: ignore + mapping = construct_refit_mapping_from_weight_name_map( + weight_name_map, new_gm.state_dict() ) + constant_mapping_with_type = {} + + for constant_name, val in constant_mapping.items(): + np_weight_type = val.dtype + val_tensor = torch.from_numpy(val).cuda() + trt_dtype = dtype.try_from(np_weight_type).to(trt.DataType) + torch_dtype = dtype.try_from(np_weight_type).to(torch.dtype) + constant_mapping_with_type[constant_name] = ( + val_tensor.clone().reshape(-1).contiguous().to(torch_dtype), + trt_dtype, + ) - mapping.update(constant_mapping_with_type) + mapping.update(constant_mapping_with_type) - for layer_name in weight_list: - if layer_name not in mapping: - logger.warning(f"{layer_name} is not found in weight mapping.") - continue - # Use Numpy to create weights - weight, weight_dtype = 
mapping[layer_name] - trt_wt_tensor = trt.Weights( - weight_dtype, weight.data_ptr(), torch.numel(weight) - ) - refitter.set_named_weights(layer_name, trt_wt_tensor, trt_wt_location) - assert ( - len(refitter.get_missing_weights()) == 0 - ), "Fast refitting failed due to incomplete mapping" + for layer_name in weight_list: + if layer_name not in mapping: + logger.warning(f"{layer_name} is not found in weight mapping.") + continue + # Use Numpy to create weights + weight, weight_dtype = mapping[layer_name] + trt_wt_tensor = trt.Weights( + weight_dtype, weight.data_ptr(), torch.numel(weight) + ) + refitter.set_named_weights(layer_name, trt_wt_tensor, trt_wt_location) + assert ( + len(refitter.get_missing_weights()) == 0 + ), "Fast refitting failed due to incomplete mapping" - else: - mapping = construct_refit_mapping(new_gm, input_list, settings) - trt_wt_location = trt.TensorLocation.HOST - for layer_name in weight_list: - if layer_name not in mapping: - raise AssertionError(f"{layer_name} is not found in weight mapping") - # Use Numpy to create weights - weight, datatype = mapping[layer_name] - trt_wt_tensor = trt.Weights(datatype, weight.ctypes.data, weight.size) - refitter.set_named_weights(layer_name, trt_wt_tensor, trt_wt_location) - refitted.add(layer_name) - - if len(refitted) != len(weight_list): - logger.warning("Not all weights have been refitted!!!") - - if not refitter.refit_cuda_engine(): - logger.error("Error: failed to refit new weights.") - raise AssertionError("Refitting failed.") + else: + mapping = construct_refit_mapping(new_gm, input_list, settings) + trt_wt_location = trt.TensorLocation.HOST + for layer_name in weight_list: + if layer_name not in mapping: + raise AssertionError(f"{layer_name} is not found in weight mapping") + # Use Numpy to create weights + weight, datatype = mapping[layer_name] + trt_wt_tensor = trt.Weights(datatype, weight.ctypes.data, weight.size) + refitter.set_named_weights(layer_name, trt_wt_tensor, trt_wt_location) + refitted.add(layer_name) + + if len(refitted) != len(weight_list): + logger.warning("Not all weights have been refitted!!!") + + if not refitter.refit_cuda_engine(): + logger.error("Error: failed to refit new weights.") + raise AssertionError("Refitting failed.") def refit_module_weights( diff --git a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py index 248e06bc3c..17f2fccbff 100644 --- a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py +++ b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py @@ -21,6 +21,7 @@ import tensorrt as trt import torch import torch.fx +from torch.fx.experimental.proxy_tensor import unset_fake_temporarily from torch.fx.node import _get_qualified_name from torch.fx.passes.shape_prop import TensorMetadata from torch.utils._python_dispatch import _disable_current_modes @@ -41,6 +42,7 @@ get_node_io, get_node_name, get_trt_tensor, + to_torch, ) from torch_tensorrt.dynamo.utils import DYNAMIC_DIM, get_model_device, to_torch_device from torch_tensorrt.fx.observer import Observer @@ -408,12 +410,13 @@ def find_weight( np_map: the map from weight name to np values in INetworkDefinition state_dict: state of the graph module """ - network_weight = torch.from_numpy(np_map[weight_name]).to(device) - for sd_w_name, sd_weight in state_dict.items(): - if TRTInterpreter.check_weight_equal(sd_weight, network_weight, device): - del state_dict[sd_w_name] - return sd_w_name - return "" + with unset_fake_temporarily(): + network_weight = 
torch.from_numpy(np_map[weight_name]).to(device) + for sd_w_name, sd_weight in state_dict.items(): + if TRTInterpreter.check_weight_equal(sd_weight, network_weight, device): + del state_dict[sd_w_name] + return sd_w_name + return "" @staticmethod def check_weight_equal( @@ -421,14 +424,15 @@ def check_weight_equal( network_weight: Union[torch.Tensor, np.ndarray], device: torch.device, ) -> Any: - if not isinstance(network_weight, torch.Tensor): - network_weight = torch.from_numpy(network_weight).to(device) - try: - return sd_weight.shape == network_weight.shape and torch.all( - torch.abs(sd_weight - network_weight) < 0.01 - ) - except Exception: - return torch.all(sd_weight == network_weight) + with unset_fake_temporarily(): + if not isinstance(network_weight, torch.Tensor): + network_weight = torch.from_numpy(network_weight).to(device) + try: + return sd_weight.shape == network_weight.shape and torch.all( + torch.abs(sd_weight - network_weight) < 0.01 + ) + except Exception: + return torch.all(sd_weight == network_weight) def _save_weight_mapping(self) -> None: """ @@ -887,9 +891,7 @@ def call_function(self, target: str, args: Any, kwargs: Any) -> Any: return converter(self.ctx, target, args, kwargs, self._cur_node_name) def get_attr(self, target: str, args: Any, kwargs: Any) -> np.ndarray: - with _disable_current_modes(): - from torch_tensorrt.dynamo.conversion.converter_utils import to_numpy - + with _disable_current_modes(), unset_fake_temporarily(): frozen_attr = self.fetch_attr(target) if isinstance(frozen_attr, torch.nn.Parameter): @@ -897,9 +899,7 @@ def get_attr(self, target: str, args: Any, kwargs: Any) -> np.ndarray: else: constant_tensor = frozen_attr - network_constant = to_numpy(constant_tensor) - - return network_constant + return to_torch(constant_tensor) def call_method(self, target: str, args: Any, kwargs: Any) -> Any: assert isinstance(target, str) diff --git a/py/torch_tensorrt/dynamo/conversion/converter_utils.py b/py/torch_tensorrt/dynamo/conversion/converter_utils.py index 62526080c4..bcb8495c67 100644 --- a/py/torch_tensorrt/dynamo/conversion/converter_utils.py +++ b/py/torch_tensorrt/dynamo/conversion/converter_utils.py @@ -9,6 +9,7 @@ import tensorrt as trt import torch import torch_tensorrt.dynamo.conversion.impl as impl +from torch.fx.experimental.proxy_tensor import unset_fake_temporarily from torch.fx.node import Argument, Target from torch.fx.passes.shape_prop import TensorMetadata from torch_tensorrt import _enums @@ -340,17 +341,47 @@ def create_constant( Returns: A TensorRT ITensor that represents the given value. """ - shape = (1,) - # Rank 0 constant is required in IFillLayer inputs. - if min_rank == 0: - shape = trt.Dims() - numpy_value = to_numpy(value, dtype) - constant = ctx.net.add_constant( - shape if isinstance(value, (int, float, bool)) else value.shape, - numpy_value.copy() if isinstance(numpy_value, np.ndarray) else numpy_value, - ) - constant.name = name - return constant.get_output(0) + with unset_fake_temporarily(): + + torch_value = to_torch(value, dtype) + if torch_value.dtype == torch.float64: + raise ValueError( + "TensorRT does not support float64 (double) precision. To resolve this, please set truncate_double=True in your compilation settings and re-run the model." + ) + # Rank 0 constant is required in IFillLayer inputs. 
+ if min_rank == 0 and isinstance(value, (int, float, bool)): + shape = trt.Dims() + elif list(torch_value.shape) == []: + shape = trt.Dims() + else: + shape = list(torch_value.shape) + + if torch_value is not None: + if torch_value.dtype == torch.bfloat16: + torch_value_fp32 = torch_value.to(torch.float32) + numpy_value = torch_value_fp32.numpy() + else: + numpy_value = torch_value.numpy() + + constant = ctx.net.add_constant( + shape, + numpy_value, + ) + constant.name = name + + if torch_value.dtype == torch.bfloat16: + return cast_trt_tensor( + ctx, + constant.get_output(0), + trt.DataType.BF16, + name + "_bf16_cast", + ) + + return constant.get_output(0) + else: + raise ValueError( + f"Cannot convert tensor '{name}' to a TensorRT constant because its value is None." + ) def get_trt_tensor( @@ -564,6 +595,9 @@ def to_numpy( value = value.dequantize() elif value.dtype == torch.bfloat16: # TODO: Remove when numpy has a BF16 type + _LOGGER.warning( + "Requested a conversion of bfloat16 tensor from torch to numpy which isn't supported. Casting this tensor to FP32 precision currently. Please use to_torch() API for better data representation", + ) value = value.to(torch.float) output = value.cpu().detach().contiguous().numpy() @@ -589,6 +623,53 @@ def to_numpy( ) +def to_torch( + value: Optional[Union[torch.Tensor, np.ndarray, int, float, bool]], + dtype: Optional[Union[torch.dtype, np.dtype, TRTDataType, _enums.dtype]] = None, +) -> Optional[torch.Tensor]: + """ + Convert a Numpy array, or scalar to a PyTorch tensor and move it to CPU + Args: + value (Optional[Union[torch.Tensor, np.ndarray, int, float, bool]]): + A PyTorch tensor, Numpy array, int, float, or bool + dtype (Optional[Union[torch.dtype, np.dtype, TRTDataType]]): + If a dtype is given, we will convert the type of the given `value` to this dtype. + Returns: + A PyTorch tensor or None, if the input was None. + """ + + cpu_device = torch.device("cpu") + torch_dtype = ( + _enums.dtype._from(dtype).to(torch.dtype, use_default=True) if dtype else None + ) + + with unset_fake_temporarily(): + if value is None: + return None + + elif isinstance(value, torch.Tensor): + output = value.to(cpu_device).contiguous() + + elif isinstance(value, np.ndarray): + output = torch.from_numpy(value).to(cpu_device).contiguous() + + elif isinstance(value, int): + output = torch.tensor([value], device=cpu_device, dtype=torch.int32) + + elif isinstance(value, float): + output = torch.tensor([value], device=cpu_device, dtype=torch.float32) + + elif isinstance(value, bool): + output = torch.tensor([value], device=cpu_device, dtype=torch.bool) + + else: + raise AssertionError( + f"to_torch can only be called on None, bool, int, float, np.ndarray, or torch.Tensor, got an object of type: {type(value)}" + ) + + return output.to(torch_dtype) if torch_dtype else output + + def flatten_dims( input: Sequence[Union[TRTTensor, torch.Tensor, np.ndarray]], start_dim: int, diff --git a/py/torch_tensorrt/dynamo/conversion/impl/conv.py b/py/torch_tensorrt/dynamo/conversion/impl/conv.py index 25419d7f60..f27fb13e97 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/conv.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/conv.py @@ -13,7 +13,7 @@ cast_trt_tensor, extend_attr_to_tuple, get_trt_tensor, - to_numpy, + to_torch, ) from torch_tensorrt.fx.converters.converter_utils import ( get_dyn_range, @@ -45,7 +45,6 @@ def convNd( assert input.shape[1] != -1, "Channel dim can't be dynamic for convolution." 
num_dims = len(input.shape) - 2 - if is_conv1d: # Apply an unsqueeze operation to transform the conv1d problem into conv2d input = impl.unsqueeze.unsqueeze( @@ -54,8 +53,8 @@ def convNd( # Process bias terms if isinstance(bias, (torch.Tensor, np.ndarray)): - # Transform the bias constant into a Numpy array - bias = to_numpy(bias, dtype=input.dtype) + bias = to_torch(bias, dtype=input.dtype) + bias = get_trt_tensor(ctx, bias, f"{name}_bias") elif isinstance(bias, TRTTensor): bias = get_trt_tensor(ctx, bias, f"{name}_bias") @@ -74,12 +73,11 @@ def convNd( ctx, target, source_ir, weight.name + "_unsqueeze_conv1d", weight, -1 ) elif isinstance(weight, (torch.Tensor, np.ndarray)): - # Transform the weight constant into a Numpy array - weight = to_numpy(weight, dtype=input.dtype) - + weight = to_torch(weight, dtype=input.dtype) # Append new dimension (unsqueeze) if the convolution is 1d if is_conv1d: - weight = np.expand_dims(weight, -1) + weight = torch.unsqueeze(weight, -1) + weight = get_trt_tensor(ctx, weight, f"{name}_weight") else: raise RuntimeError( diff --git a/py/torch_tensorrt/dynamo/conversion/impl/deconv.py b/py/torch_tensorrt/dynamo/conversion/impl/deconv.py index d19a92e646..629cecf5db 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/deconv.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/deconv.py @@ -6,13 +6,12 @@ import tensorrt as trt import torch from torch.fx.node import Target - from torch_tensorrt.dynamo.conversion import impl from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext from torch_tensorrt.dynamo.conversion.converter_utils import ( extend_attr_to_tuple, get_trt_tensor, - to_numpy, + to_torch, ) from torch_tensorrt.fx.converters.converter_utils import ( SourceIR, @@ -53,7 +52,8 @@ def deconvNd( # Process bias terms if isinstance(bias, (torch.Tensor, np.ndarray)): # Transform the bias constant into a Numpy array - bias = to_numpy(bias) + bias = to_torch(bias, dtype=input.dtype) + bias = get_trt_tensor(ctx, bias, f"{name}_bias") elif isinstance(bias, TRTTensor): bias = get_trt_tensor(ctx, bias, f"{name}_bias") @@ -73,12 +73,12 @@ def deconvNd( ) elif isinstance(weight, (torch.Tensor, np.ndarray)): - # Transform the weight constant into a Numpy array - weight = to_numpy(weight) - + weight = to_torch(weight, dtype=input.dtype) # Append new dimension (unsqueeze) if the deconvolution is 1d if is_deconv1d: - weight = np.expand_dims(weight, axis=-1) + weight = torch.unsqueeze(weight, -1) + + weight = get_trt_tensor(ctx, weight, f"{name}_weight") else: raise RuntimeError( diff --git a/py/torch_tensorrt/dynamo/conversion/impl/quantize.py b/py/torch_tensorrt/dynamo/conversion/impl/quantize.py index b97840cd09..e472ed3092 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/quantize.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/quantize.py @@ -1,11 +1,13 @@ -from typing import Optional +from typing import Optional, Union import numpy as np import tensorrt as trt +import torch +from torch.fx.experimental.proxy_tensor import unset_fake_temporarily from torch.fx.node import Target from torch_tensorrt.dynamo._SourceIR import SourceIR from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext -from torch_tensorrt.dynamo.conversion.converter_utils import get_trt_tensor +from torch_tensorrt.dynamo.conversion.converter_utils import get_trt_tensor, to_torch from torch_tensorrt.fx.converters.converter_utils import set_layer_name from torch_tensorrt.fx.types import TRTTensor @@ -16,7 +18,7 @@ def quantize( source_ir: 
Optional[SourceIR], name: str, input_tensor: TRTTensor, - amax: np.ndarray, + amax: Union[np.ndarray, torch.Tensor], num_bits: int, exponent_bits: int, ) -> TRTTensor: @@ -24,40 +26,44 @@ def quantize( Adds quantize and dequantize ops (QDQ) which quantize to INT8 or FP8 based on the output_type set and dequantizes them back. """ - if isinstance(input_tensor, TRTTensor) and input_tensor.dtype not in ( - trt.float32, - trt.float16, - ): - raise ValueError( - f"quantize converter received an input of {input_tensor.dtype} type. Supported types: float32 | float16" - ) - if num_bits != 8 or exponent_bits not in (0, 4): - raise ValueError( - f"quantize converter currently only accept INT8 or FP8 based quantize, got {num_bits=}, {exponent_bits=}" - ) - if num_bits == 8 and exponent_bits == 0: - max_bound = 127 - elif num_bits == 8 and exponent_bits == 4: - max_bound = 448 - scale = np.divide(amax, max_bound) - scale = get_trt_tensor(ctx, scale, name + "_scale") - # Add Q node - quantize_layer = ctx.net.add_quantize(input_tensor, scale) - if num_bits == 8 and exponent_bits == 0: - quantize_layer.set_output_type(0, trt.DataType.INT8) - elif num_bits == 8 and exponent_bits == 4: - quantize_layer.set_output_type(0, trt.DataType.FP8) - set_layer_name(quantize_layer, target, name + "_quantize", source_ir) - q_output = quantize_layer.get_output(0) - # Add DQ node - dequantize_layer = ctx.net.add_dequantize(q_output, scale) - set_layer_name(dequantize_layer, target, name + "_dequantize", source_ir) - if num_bits == 8 and exponent_bits == 0: - dequantize_layer.precision = trt.DataType.INT8 - elif num_bits == 8 and exponent_bits == 4: - # Set DQ layer precision to FP8 - dequantize_layer.precision = trt.DataType.FP8 - dq_output = dequantize_layer.get_output(0) + with unset_fake_temporarily(): + if isinstance(input_tensor, TRTTensor) and input_tensor.dtype not in ( + trt.float32, + trt.float16, + ): + raise ValueError( + f"quantize converter received an input of {input_tensor.dtype} type. 
Supported types: float32 | float16" + ) + if num_bits != 8 or exponent_bits not in (0, 4): + raise ValueError( + f"quantize converter currently only accept INT8 or FP8 based quantize, got {num_bits=}, {exponent_bits=}" + ) + if num_bits == 8 and exponent_bits == 0: + max_bound = 127 + elif num_bits == 8 and exponent_bits == 4: + max_bound = 448 - return dq_output + amax = to_torch(amax, None) + scale = torch.divide(amax, max_bound) + scale = get_trt_tensor(ctx, scale, name + "_scale") + # Add Q node + quantize_layer = ctx.net.add_quantize(input_tensor, scale) + if num_bits == 8 and exponent_bits == 0: + quantize_layer.set_output_type(0, trt.DataType.INT8) + elif num_bits == 8 and exponent_bits == 4: + quantize_layer.set_output_type(0, trt.DataType.FP8) + + set_layer_name(quantize_layer, target, name + "_quantize", source_ir) + q_output = quantize_layer.get_output(0) + # Add DQ node + dequantize_layer = ctx.net.add_dequantize(q_output, scale) + set_layer_name(dequantize_layer, target, name + "_dequantize", source_ir) + if num_bits == 8 and exponent_bits == 0: + dequantize_layer.precision = trt.DataType.INT8 + elif num_bits == 8 and exponent_bits == 4: + # Set DQ layer precision to FP8 + dequantize_layer.precision = trt.DataType.FP8 + dq_output = dequantize_layer.get_output(0) + + return dq_output diff --git a/tests/py/dynamo/models/test_models.py b/tests/py/dynamo/models/test_models.py index b6f986711a..6314baa5ec 100644 --- a/tests/py/dynamo/models/test_models.py +++ b/tests/py/dynamo/models/test_models.py @@ -182,3 +182,94 @@ def test_resnet18_half(ir): # Clean up model env torch._dynamo.reset() + + +@pytest.mark.unit +def test_bf16_model(ir): + class MyModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = torch.nn.Conv2d(3, 16, 3, stride=1, bias=True) + self.relu = torch.nn.ReLU() + + def forward(self, x): + out = self.conv(x) + out = self.relu(out) + return out + + model = MyModule().eval().cuda().to(torch.bfloat16) + input = torch.randn((1, 3, 224, 224)).to("cuda").to(torch.bfloat16) + + compile_spec = { + "inputs": [ + torchtrt.Input( + input.shape, dtype=torch.bfloat16, format=torch.contiguous_format + ) + ], + "device": torchtrt.Device("cuda:0"), + "enabled_precisions": {torch.float32}, + "ir": ir, + "pass_through_build_failures": True, + "min_block_size": 1, + "cache_built_engines": False, + "reuse_cached_engines": False, + "use_explicit_typing": True, + } + + trt_mod = torchtrt.compile(model, **compile_spec) + cos_sim = cosine_similarity(model(input), trt_mod(input)) + + assertions.assertTrue( + cos_sim > COSINE_THRESHOLD, + msg=f"BF16 model TRT outputs don't match with the original model. 
Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + ) + + # Clean up model env + torch._dynamo.reset() + + +@pytest.mark.unit +def test_bf16_fallback_model(ir): + class MyModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = torch.nn.Conv2d(3, 16, 3, padding=1, stride=1, bias=True) + self.relu = torch.nn.ReLU() + self.conv2 = torch.nn.Conv2d(16, 16, 3, padding=1, stride=1, bias=True) + + def forward(self, x): + out = self.conv(x) + out = self.relu(out) + out = self.conv2(out) + return out + + model = MyModule().eval().cuda().to(torch.bfloat16) + input = torch.randn((1, 3, 224, 224)).to("cuda").to(torch.bfloat16) + + compile_spec = { + "inputs": [ + torchtrt.Input( + input.shape, dtype=torch.bfloat16, format=torch.contiguous_format + ) + ], + "device": torchtrt.Device("cuda:0"), + "enabled_precisions": {torch.float32}, + "ir": ir, + "pass_through_build_failures": True, + "min_block_size": 1, + "cache_built_engines": False, + "reuse_cached_engines": False, + "use_explicit_typing": True, + "torch_executed_ops": {"torch.ops.aten.relu.default"}, + } + + trt_mod = torchtrt.compile(model, **compile_spec) + cos_sim = cosine_similarity(model(input), trt_mod(input)) + + assertions.assertTrue( + cos_sim > COSINE_THRESHOLD, + msg=f"BF16 fallback model TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + ) + + # Clean up model env + torch._dynamo.reset() diff --git a/tests/py/dynamo/models/test_models_export.py b/tests/py/dynamo/models/test_models_export.py index 469ed569d1..6f96e259b0 100644 --- a/tests/py/dynamo/models/test_models_export.py +++ b/tests/py/dynamo/models/test_models_export.py @@ -249,6 +249,7 @@ def calibrate_loop(model): @unittest.skipIf( platform.system() != "Linux" + or torch.cuda.get_device_capability() < (8, 9) or not importlib.util.find_spec("modelopt") or Version(metadata.version("nvidia-modelopt")) < Version("0.17.0"), "modelopt 0.17.0 or later is required, Int8 quantization is supported in modelopt since 0.17.0 or later for linux", @@ -257,7 +258,6 @@ def calibrate_loop(model): def test_base_int8(ir): import modelopt.torch.quantization as mtq from modelopt.torch.quantization.utils import export_torch_mode - from torch.export._trace import _export class SimpleNetwork(torch.nn.Module): def __init__(self): @@ -285,7 +285,7 @@ def calibrate_loop(model): with torch.no_grad(): with export_torch_mode(): - exp_program = _export(model, (input_tensor,)) + exp_program = torch.export.export(model, (input_tensor,)) trt_model = torchtrt.dynamo.compile( exp_program, inputs=[input_tensor], @@ -294,6 +294,7 @@ def calibrate_loop(model): debug=True, cache_built_engines=False, reuse_cached_engines=False, + truncate_double=True, ) outputs_trt = trt_model(input_tensor) assert torch.allclose(output_pyt, outputs_trt, rtol=5e-3, atol=1e-2) From 3b177ac130af9e3d8961a21dd34f21543bf4ad44 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Tue, 1 Apr 2025 01:09:40 -0700 Subject: [PATCH 02/13] chore: reenable the test --- tests/py/dynamo/models/test_models_export.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/py/dynamo/models/test_models_export.py b/tests/py/dynamo/models/test_models_export.py index 6f96e259b0..f5230f3ace 100644 --- a/tests/py/dynamo/models/test_models_export.py +++ b/tests/py/dynamo/models/test_models_export.py @@ -249,7 +249,6 @@ def calibrate_loop(model): @unittest.skipIf( platform.system() != "Linux" - or torch.cuda.get_device_capability() < (8, 9) or not 
importlib.util.find_spec("modelopt") or Version(metadata.version("nvidia-modelopt")) < Version("0.17.0"), "modelopt 0.17.0 or later is required, Int8 quantization is supported in modelopt since 0.17.0 or later for linux", From 112f67b761a853d321dc4eb1423ec73fb08218b8 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Tue, 1 Apr 2025 11:32:33 -0700 Subject: [PATCH 03/13] chore: separate the tests --- .github/workflows/build-test-linux.yml | 1 + tests/py/dynamo/models/test_models_export.py | 105 ------------------- 2 files changed, 1 insertion(+), 105 deletions(-) diff --git a/.github/workflows/build-test-linux.yml b/.github/workflows/build-test-linux.yml index 024afd8c62..91b7f6633b 100644 --- a/.github/workflows/build-test-linux.yml +++ b/.github/workflows/build-test-linux.yml @@ -174,6 +174,7 @@ jobs: python -m pip install -r requirements.txt cd dynamo python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dyn_models_export.xml --ir dynamo models/ + python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dyn_models_export.xml --ir dynamo test_modelopt_models.py popd tests-py-dynamo-serde: diff --git a/tests/py/dynamo/models/test_models_export.py b/tests/py/dynamo/models/test_models_export.py index f5230f3ace..9dfc7abebd 100644 --- a/tests/py/dynamo/models/test_models_export.py +++ b/tests/py/dynamo/models/test_models_export.py @@ -192,108 +192,3 @@ def test_resnet18_half(ir): # Clean up model env torch._dynamo.reset() - - -@unittest.skipIf( - torch.cuda.get_device_capability() < (8, 9), - "FP8 quantization requires compute capability 8.9 or later", -) -@unittest.skipIf( - not importlib.util.find_spec("modelopt"), - "ModelOpt is required to run this test", -) -@pytest.mark.unit -def test_base_fp8(ir): - import modelopt.torch.quantization as mtq - from modelopt.torch.quantization.utils import export_torch_mode - - class SimpleNetwork(torch.nn.Module): - def __init__(self): - super(SimpleNetwork, self).__init__() - self.linear1 = torch.nn.Linear(in_features=10, out_features=5) - self.linear2 = torch.nn.Linear(in_features=5, out_features=1) - - def forward(self, x): - x = self.linear1(x) - x = torch.nn.ReLU()(x) - x = self.linear2(x) - return x - - def calibrate_loop(model): - """Simple calibration function for testing.""" - model(input_tensor) - - input_tensor = torch.randn(1, 10).cuda() - model = SimpleNetwork().eval().cuda() - - quant_cfg = mtq.FP8_DEFAULT_CFG - mtq.quantize(model, quant_cfg, forward_loop=calibrate_loop) - # model has FP8 qdq nodes at this point - output_pyt = model(input_tensor) - - with torch.no_grad(): - with export_torch_mode(): - exp_program = torch.export.export(model, (input_tensor,), strict=False) - trt_model = torchtrt.dynamo.compile( - exp_program, - inputs=[input_tensor], - enabled_precisions={torch.float8_e4m3fn}, - min_block_size=1, - debug=True, - cache_built_engines=False, - reuse_cached_engines=False, - ) - outputs_trt = trt_model(input_tensor) - assert torch.allclose(output_pyt, outputs_trt, rtol=5e-3, atol=1e-2) - - -@unittest.skipIf( - platform.system() != "Linux" - or not importlib.util.find_spec("modelopt") - or Version(metadata.version("nvidia-modelopt")) < Version("0.17.0"), - "modelopt 0.17.0 or later is required, Int8 quantization is supported in modelopt since 0.17.0 or later for linux", -) -@pytest.mark.unit -def test_base_int8(ir): - import modelopt.torch.quantization as mtq - from modelopt.torch.quantization.utils import export_torch_mode - - class SimpleNetwork(torch.nn.Module): - def __init__(self): - super(SimpleNetwork, 
self).__init__() - self.linear1 = torch.nn.Linear(in_features=10, out_features=5) - self.linear2 = torch.nn.Linear(in_features=5, out_features=1) - - def forward(self, x): - x = self.linear1(x) - x = torch.nn.ReLU()(x) - x = self.linear2(x) - return x - - def calibrate_loop(model): - """Simple calibration function for testing.""" - model(input_tensor) - - input_tensor = torch.randn(1, 10).cuda() - model = SimpleNetwork().eval().cuda() - - quant_cfg = mtq.INT8_DEFAULT_CFG - mtq.quantize(model, quant_cfg, forward_loop=calibrate_loop) - # model has INT8 qdq nodes at this point - output_pyt = model(input_tensor) - - with torch.no_grad(): - with export_torch_mode(): - exp_program = torch.export.export(model, (input_tensor,)) - trt_model = torchtrt.dynamo.compile( - exp_program, - inputs=[input_tensor], - enabled_precisions={torch.int8}, - min_block_size=1, - debug=True, - cache_built_engines=False, - reuse_cached_engines=False, - truncate_double=True, - ) - outputs_trt = trt_model(input_tensor) - assert torch.allclose(output_pyt, outputs_trt, rtol=5e-3, atol=1e-2) From 736ff2520e741f9a0570ae4a9fbbcb29284cf3c2 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Tue, 1 Apr 2025 12:40:54 -0700 Subject: [PATCH 04/13] chore: minor fix --- .github/workflows/build-test-linux.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-test-linux.yml b/.github/workflows/build-test-linux.yml index 91b7f6633b..7c92573f4e 100644 --- a/.github/workflows/build-test-linux.yml +++ b/.github/workflows/build-test-linux.yml @@ -174,7 +174,7 @@ jobs: python -m pip install -r requirements.txt cd dynamo python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dyn_models_export.xml --ir dynamo models/ - python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dyn_models_export.xml --ir dynamo test_modelopt_models.py + python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dyn_models_export.xml test_modelopt_models.py popd tests-py-dynamo-serde: From b37add3aa3e62b7ec16c0e7419a1fd9b03db26b5 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Tue, 8 Apr 2025 15:11:16 -0700 Subject: [PATCH 05/13] chore: updates --- tests/py/dynamo/test_modelopt_models.py | 118 ++++++++++++++++++++++++ 1 file changed, 118 insertions(+) create mode 100644 tests/py/dynamo/test_modelopt_models.py diff --git a/tests/py/dynamo/test_modelopt_models.py b/tests/py/dynamo/test_modelopt_models.py new file mode 100644 index 0000000000..2593d03e74 --- /dev/null +++ b/tests/py/dynamo/test_modelopt_models.py @@ -0,0 +1,118 @@ +# type: ignore +import importlib +import platform +import unittest +from importlib import metadata + +import pytest +import torch +import torch_tensorrt as torchtrt + +from packaging.version import Version + +assertions = unittest.TestCase() + + +@unittest.skipIf( + torch.cuda.get_device_capability() < (8, 9), + "FP8 quantization requires compute capability 8.9 or later", +) +@unittest.skipIf( + not importlib.util.find_spec("modelopt"), + "ModelOpt is required to run this test", +) +@pytest.mark.unit +def test_base_fp8(ir): + import modelopt.torch.quantization as mtq + from modelopt.torch.quantization.utils import export_torch_mode + + class SimpleNetwork(torch.nn.Module): + def __init__(self): + super(SimpleNetwork, self).__init__() + self.linear1 = torch.nn.Linear(in_features=10, out_features=5) + self.linear2 = torch.nn.Linear(in_features=5, out_features=1) + + def forward(self, x): + x = self.linear1(x) + x = torch.nn.ReLU()(x) + x = self.linear2(x) + return x + + def 
calibrate_loop(model): + """Simple calibration function for testing.""" + model(input_tensor) + + input_tensor = torch.randn(1, 10).cuda() + model = SimpleNetwork().eval().cuda() + + quant_cfg = mtq.FP8_DEFAULT_CFG + mtq.quantize(model, quant_cfg, forward_loop=calibrate_loop) + # model has FP8 qdq nodes at this point + output_pyt = model(input_tensor) + + with torch.no_grad(): + with export_torch_mode(): + exp_program = torch.export.export(model, (input_tensor,), strict=False) + trt_model = torchtrt.dynamo.compile( + exp_program, + inputs=[input_tensor], + enabled_precisions={torch.float8_e4m3fn}, + min_block_size=1, + debug=True, + cache_built_engines=False, + reuse_cached_engines=False, + ) + outputs_trt = trt_model(input_tensor) + assert torch.allclose(output_pyt, outputs_trt, rtol=5e-3, atol=1e-2) + + +@unittest.skipIf( + platform.system() != "Linux" + or not importlib.util.find_spec("modelopt") + or Version(metadata.version("nvidia-modelopt")) < Version("0.17.0"), + "modelopt 0.17.0 or later is required, Int8 quantization is supported in modelopt since 0.17.0 or later for linux", +) +@pytest.mark.unit +def test_base_int8(ir): + import modelopt.torch.quantization as mtq + from modelopt.torch.quantization.utils import export_torch_mode + + class SimpleNetwork(torch.nn.Module): + def __init__(self): + super(SimpleNetwork, self).__init__() + self.linear1 = torch.nn.Linear(in_features=10, out_features=5) + self.linear2 = torch.nn.Linear(in_features=5, out_features=1) + + def forward(self, x): + x = self.linear1(x) + x = torch.nn.ReLU()(x) + x = self.linear2(x) + return x + + def calibrate_loop(model): + """Simple calibration function for testing.""" + model(input_tensor) + + input_tensor = torch.randn(1, 10).cuda() + model = SimpleNetwork().eval().cuda() + + quant_cfg = mtq.INT8_DEFAULT_CFG + mtq.quantize(model, quant_cfg, forward_loop=calibrate_loop) + # model has INT8 qdq nodes at this point + output_pyt = model(input_tensor) + + with torch.no_grad(): + with export_torch_mode(): + exp_program = torch.export.export(model, (input_tensor,)) + trt_model = torchtrt.dynamo.compile( + exp_program, + inputs=[input_tensor], + enabled_precisions={torch.int8}, + min_block_size=1, + debug=True, + cache_built_engines=False, + reuse_cached_engines=False, + truncate_double=True, + ) + outputs_trt = trt_model(input_tensor) + assert torch.allclose(output_pyt, outputs_trt, rtol=5e-3, atol=1e-2) From 133d3e27e9805f2bde326dfd91d0229768fffd61 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Wed, 9 Apr 2025 15:07:42 -0700 Subject: [PATCH 06/13] chore: updates --- tests/py/dynamo/test_modelopt_models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/py/dynamo/test_modelopt_models.py b/tests/py/dynamo/test_modelopt_models.py index 2593d03e74..50ccd2880d 100644 --- a/tests/py/dynamo/test_modelopt_models.py +++ b/tests/py/dynamo/test_modelopt_models.py @@ -22,7 +22,7 @@ "ModelOpt is required to run this test", ) @pytest.mark.unit -def test_base_fp8(ir): +def test_base_fp8(): import modelopt.torch.quantization as mtq from modelopt.torch.quantization.utils import export_torch_mode @@ -73,7 +73,7 @@ def calibrate_loop(model): "modelopt 0.17.0 or later is required, Int8 quantization is supported in modelopt since 0.17.0 or later for linux", ) @pytest.mark.unit -def test_base_int8(ir): +def test_base_int8(): import modelopt.torch.quantization as mtq from modelopt.torch.quantization.utils import export_torch_mode From 1835fa3ee3a03b2dedbb15bdd2ab3fce74966ea9 Mon Sep 17 00:00:00 2001 From: 
Dheeraj Peri Date: Thu, 17 Apr 2025 16:54:00 -0700 Subject: [PATCH 07/13] chore: bug fix --- .../dynamo/conversion/converter_utils.py | 63 ++++++++++--------- 1 file changed, 33 insertions(+), 30 deletions(-) diff --git a/py/torch_tensorrt/dynamo/conversion/converter_utils.py b/py/torch_tensorrt/dynamo/conversion/converter_utils.py index bcb8495c67..375eed65a2 100644 --- a/py/torch_tensorrt/dynamo/conversion/converter_utils.py +++ b/py/torch_tensorrt/dynamo/conversion/converter_utils.py @@ -585,42 +585,45 @@ def to_numpy( Returns: A Numpy array or None, if the input was None. """ - output = None + with unset_fake_temporarily(): + output = None - if value is None or isinstance(value, np.ndarray): - output = value + if value is None or isinstance(value, np.ndarray): + output = value - elif isinstance(value, torch.Tensor): - if value.is_quantized: - value = value.dequantize() - elif value.dtype == torch.bfloat16: - # TODO: Remove when numpy has a BF16 type - _LOGGER.warning( - "Requested a conversion of bfloat16 tensor from torch to numpy which isn't supported. Casting this tensor to FP32 precision currently. Please use to_torch() API for better data representation", - ) - value = value.to(torch.float) - - output = value.cpu().detach().contiguous().numpy() + elif isinstance(value, torch.Tensor): + if value.is_quantized: + value = value.dequantize() + elif value.dtype == torch.bfloat16: + # TODO: Remove when numpy has a BF16 type + _LOGGER.warning( + "Requested a conversion of bfloat16 tensor from torch to numpy which isn't supported. Casting this tensor to FP32 precision currently. Please use to_torch() API for better data representation", + ) + value = value.to(torch.float) - elif isinstance(value, int): - output = np.array([value], dtype=np.int32) + output = value.cpu().detach().contiguous().numpy() - elif isinstance(value, float): - output = np.array([value], dtype=np.float32) + elif isinstance(value, int): + output = np.array([value], dtype=np.int32) - elif isinstance(value, bool): - output = np.array([value], dtype=np.bool_) + elif isinstance(value, float): + output = np.array([value], dtype=np.float32) - if isinstance(output, np.ndarray) or output is None: - return ( - output - if (dtype is None or output is None) - else output.astype(_enums.dtype._from(dtype).to(np.dtype, use_default=True)) - ) - else: - raise AssertionError( - f"to_numpy can only be called on None, bool, int, float, np.ndarray, or torch.Tensor, got: {value}" - ) + elif isinstance(value, bool): + output = np.array([value], dtype=np.bool_) + + if isinstance(output, np.ndarray) or output is None: + return ( + output + if (dtype is None or output is None) + else output.astype( + _enums.dtype._from(dtype).to(np.dtype, use_default=True) + ) + ) + else: + raise AssertionError( + f"to_numpy can only be called on None, bool, int, float, np.ndarray, or torch.Tensor, got: {value}" + ) def to_torch( From c57aa0491f8b80b144ad33fa158c98f590ef8398 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Tue, 22 Apr 2025 10:30:42 -0700 Subject: [PATCH 08/13] chore: bug fixes for refit tests, restructure CI tests --- .github/workflows/build-test-linux.yml | 9 +- py/torch_tensorrt/dynamo/_refit.py | 66 ++---- .../{ => models}/test_modelopt_models.py | 0 tests/py/dynamo/models/test_models_export.py | 199 ------------------ 4 files changed, 30 insertions(+), 244 deletions(-) rename tests/py/dynamo/{ => models}/test_modelopt_models.py (100%) delete mode 100644 tests/py/dynamo/models/test_models_export.py diff --git 
a/.github/workflows/build-test-linux.yml b/.github/workflows/build-test-linux.yml index 300dbd9d0d..9efb2ab146 100644 --- a/.github/workflows/build-test-linux.yml +++ b/.github/workflows/build-test-linux.yml @@ -172,8 +172,12 @@ jobs: cd tests/py python -m pip install -r requirements.txt cd dynamo - python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dyn_models_export.xml --ir dynamo models/ - python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dyn_models_export.xml test_modelopt_models.py + python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dynamo_models.xml --ir dynamo models/test_models.py + python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dynamo_models_dynamic.xml --ir dynamo models/test_dyn_models.py + python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/engine_cache.xml --ir dynamo models/test_engine_cache.py + python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/model_refit.xml --ir dynamo models/test_model_refit.py + python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/modelopt_models.xml --ir dynamo models/test_modelopt_models.py + python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/weight_stripped_engine.xml --ir dynamo models/test_weight_stripped_engine.py popd tests-py-dynamo-serde: @@ -206,6 +210,7 @@ jobs: cd dynamo python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/export_serde_test_results.xml --ir dynamo models/test_export_serde.py python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/reexport_test_results.xml --ir dynamo models/test_reexport.py + python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/export_kwargs_serde_test_results.xml --ir dynamo models/test_export_kwargs_serde.py popd tests-py-torch-compile-be: diff --git a/py/torch_tensorrt/dynamo/_refit.py b/py/torch_tensorrt/dynamo/_refit.py index 1082222aab..f1215730a9 100644 --- a/py/torch_tensorrt/dynamo/_refit.py +++ b/py/torch_tensorrt/dynamo/_refit.py @@ -48,7 +48,7 @@ logger = logging.getLogger(__name__) -@needs_refit +@needs_refit # type: ignore def construct_refit_mapping( module: torch.fx.GraphModule, inputs: Sequence[Input], @@ -110,7 +110,7 @@ def construct_refit_mapping( return weight_map -@needs_refit +@needs_refit # type: ignore def construct_refit_mapping_from_weight_name_map( weight_name_map: dict[Any, Any], state_dict: dict[Any, Any], @@ -141,7 +141,7 @@ def construct_refit_mapping_from_weight_name_map( return engine_weight_map -@needs_refit +@needs_refit # type: ignore def _refit_single_trt_engine_with_gm( new_gm: torch.fx.GraphModule, old_engine: trt.ICudaEngine, @@ -153,12 +153,12 @@ def _refit_single_trt_engine_with_gm( Refit a TensorRT Engine in place """ - with unset_fake_temporarily(): - refitted = set() - torch_device = get_model_device(new_gm) - refitter = trt.Refitter(old_engine, TRT_LOGGER) - weight_list = refitter.get_all_weights() + refitted = set() + torch_device = get_model_device(new_gm) + refitter = trt.Refitter(old_engine, TRT_LOGGER) + weight_list = refitter.get_all_weights() + with unset_fake_temporarily(): if weight_name_map: # Get the refitting mapping trt_wt_location = ( @@ -185,41 +185,21 @@ def _refit_single_trt_engine_with_gm( trt_dtype, ) - constant_mapping: dict[str, Any] = weight_name_map.pop( - "constant_mapping", {} - ) # type: ignore - mapping = construct_refit_mapping_from_weight_name_map( - weight_name_map, new_gm.state_dict() - ) - constant_mapping_with_type = {} - - for constant_name, val in constant_mapping.items(): - np_weight_type = val.dtype - val_tensor = torch.from_numpy(val).cuda() - 
trt_dtype = dtype.try_from(np_weight_type).to(trt.DataType) - torch_dtype = dtype.try_from(np_weight_type).to(torch.dtype) - constant_mapping_with_type[constant_name] = ( - val_tensor.clone().reshape(-1).contiguous().to(torch_dtype), - trt_dtype, - ) + mapping.update(constant_mapping_with_type) - mapping.update(constant_mapping_with_type) - - for layer_name in weight_list: - if layer_name not in mapping: - logger.warning(f"{layer_name} is not found in weight mapping.") - continue - # Use Numpy to create weights - weight, weight_dtype = mapping[layer_name] - trt_wt_tensor = trt.Weights( - weight_dtype, weight.data_ptr(), torch.numel(weight) - ) - refitter.set_named_weights( - layer_name, trt_wt_tensor, trt_wt_location - ) - assert ( - len(refitter.get_missing_weights()) == 0 - ), "Fast refitting failed due to incomplete mapping" + for layer_name in weight_list: + if layer_name not in mapping: + logger.warning(f"{layer_name} is not found in weight mapping.") + continue + # Use Numpy to create weights + weight, weight_dtype = mapping[layer_name] + trt_wt_tensor = trt.Weights( + weight_dtype, weight.data_ptr(), torch.numel(weight) + ) + refitter.set_named_weights(layer_name, trt_wt_tensor, trt_wt_location) + assert ( + len(refitter.get_missing_weights()) == 0 + ), "Fast refitting failed due to incomplete mapping" else: mapping = construct_refit_mapping(new_gm, input_list, settings) @@ -241,7 +221,7 @@ def _refit_single_trt_engine_with_gm( raise AssertionError("Refitting failed.") -@needs_refit +@needs_refit # type: ignore def refit_module_weights( compiled_module: torch.fx.GraphModule | ExportedProgram, new_weight_module: ExportedProgram, diff --git a/tests/py/dynamo/test_modelopt_models.py b/tests/py/dynamo/models/test_modelopt_models.py similarity index 100% rename from tests/py/dynamo/test_modelopt_models.py rename to tests/py/dynamo/models/test_modelopt_models.py diff --git a/tests/py/dynamo/models/test_models_export.py b/tests/py/dynamo/models/test_models_export.py deleted file mode 100644 index 005594d62e..0000000000 --- a/tests/py/dynamo/models/test_models_export.py +++ /dev/null @@ -1,199 +0,0 @@ -# type: ignore -import importlib -import platform -import unittest -from importlib import metadata - -import pytest -import timm -import torch -import torch_tensorrt as torchtrt -import torchvision.models as models -from torch_tensorrt.dynamo.utils import COSINE_THRESHOLD, cosine_similarity - -from packaging.version import Version - -assertions = unittest.TestCase() - - -@pytest.mark.unit -def test_resnet18(ir): - model = models.resnet18(pretrained=True).eval().to("cuda") - input = torch.randn((1, 3, 224, 224)).to("cuda") - - compile_spec = { - "inputs": [ - torchtrt.Input( - input.shape, dtype=torch.float, format=torch.contiguous_format - ) - ], - "device": torchtrt.Device("cuda:0"), - "enabled_precisions": {torch.float}, - "ir": ir, - "pass_through_build_failures": True, - "optimization_level": 1, - "min_block_size": 8, - "cache_built_engines": False, - "reuse_cached_engines": False, - } - - trt_mod = torchtrt.compile(model, **compile_spec) - cos_sim = cosine_similarity(model(input), trt_mod(input)[0]) - assertions.assertTrue( - cos_sim > COSINE_THRESHOLD, - msg=f"Resnet18 TRT outputs don't match with the original model. 
Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", - ) - - # Clean up model env - torch._dynamo.reset() - - -@pytest.mark.unit -def test_mobilenet_v2(ir): - model = models.mobilenet_v2(pretrained=True).eval().to("cuda") - input = torch.randn((1, 3, 224, 224)).to("cuda") - - compile_spec = { - "inputs": [ - torchtrt.Input( - input.shape, dtype=torch.float, format=torch.contiguous_format - ) - ], - "device": torchtrt.Device("cuda:0"), - "enabled_precisions": {torch.float}, - "ir": ir, - "pass_through_build_failures": True, - "optimization_level": 1, - "min_block_size": 8, - "cache_built_engines": False, - "reuse_cached_engines": False, - } - - trt_mod = torchtrt.compile(model, **compile_spec) - cos_sim = cosine_similarity(model(input), trt_mod(input)[0]) - assertions.assertTrue( - cos_sim > COSINE_THRESHOLD, - msg=f"Mobilenet v2 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", - ) - - # Clean up model env - torch._dynamo.reset() - - -@pytest.mark.unit -def test_efficientnet_b0(ir): - model = timm.create_model("efficientnet_b0", pretrained=True).eval().to("cuda") - input = torch.randn((1, 3, 224, 224)).to("cuda") - - compile_spec = { - "inputs": [ - torchtrt.Input( - input.shape, dtype=torch.float, format=torch.contiguous_format - ) - ], - "device": torchtrt.Device("cuda:0"), - "enabled_precisions": {torch.float}, - "ir": ir, - "pass_through_build_failures": True, - "optimization_level": 1, - "min_block_size": 8, - "cache_built_engines": False, - "reuse_cached_engines": False, - } - - trt_mod = torchtrt.compile(model, **compile_spec) - cos_sim = cosine_similarity(model(input), trt_mod(input)[0]) - assertions.assertTrue( - cos_sim > COSINE_THRESHOLD, - msg=f"EfficientNet-B0 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", - ) - - # Clean up model env - torch._dynamo.reset() - - -@pytest.mark.unit -@unittest.skipIf( - not importlib.util.find_spec("transformers"), - "transformers is required to run this test", -) -def test_bert_base_uncased(ir): - from transformers import BertModel - - model = ( - BertModel.from_pretrained("bert-base-uncased", return_dict=False).cuda().eval() - ) - input = torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda") - input2 = torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda") - - compile_spec = { - "inputs": [ - torchtrt.Input( - input.shape, - dtype=input.dtype, - format=torch.contiguous_format, - ), - torchtrt.Input( - input.shape, - dtype=input.dtype, - format=torch.contiguous_format, - ), - ], - "device": torchtrt.Device("cuda:0"), - "enabled_precisions": {torch.float}, - "truncate_double": True, - "ir": ir, - "min_block_size": 10, - "cache_built_engines": False, - "reuse_cached_engines": False, - } - trt_mod = torchtrt.compile(model, **compile_spec) - model_outputs = model(input, input2) - trt_model_outputs = trt_mod(input, input2) - assertions.assertTrue( - len(model_outputs) == len(trt_model_outputs), - msg=f"Number of outputs for BERT model compilation is different with Pytorch {len(model_outputs)} and TensorRT {len(trt_model_outputs)}. Please check the compilation.", - ) - - for index in range(len(model_outputs)): - out, trt_out = model_outputs[index], trt_model_outputs[index] - cos_sim = cosine_similarity(out, trt_out) - assertions.assertTrue( - cos_sim > COSINE_THRESHOLD, - msg=f"HF BERT base-uncased TRT outputs don't match with the original model. 
Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", - ) - - # Clean up model env - torch._dynamo.reset() - - -@pytest.mark.unit -def test_resnet18_half(ir): - model = models.resnet18(pretrained=True).eval().to("cuda").half() - input = torch.randn((1, 3, 224, 224)).to("cuda").half() - - compile_spec = { - "inputs": [ - torchtrt.Input( - input.shape, dtype=torch.half, format=torch.contiguous_format - ) - ], - "device": torchtrt.Device("cuda:0"), - "enabled_precisions": {torch.half}, - "ir": ir, - "pass_through_build_failures": True, - "optimization_level": 1, - "min_block_size": 8, - "cache_built_engines": False, - "reuse_cached_engines": False, - } - - trt_mod = torchtrt.compile(model, **compile_spec) - cos_sim = cosine_similarity(model(input), trt_mod(input)[0]) - assertions.assertTrue( - cos_sim > COSINE_THRESHOLD, - msg=f"Resnet18 Half TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", - ) - - # Clean up model env - torch._dynamo.reset() From 574b6bd820158b7a6b7bce053b0df9f07a02135a Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Tue, 22 Apr 2025 10:36:48 -0700 Subject: [PATCH 09/13] chore: remove debug logging in CI tests, add missing dtype_support test --- .github/workflows/build-test-linux.yml | 1 + tests/py/dynamo/backend/test_backend_compiler.py | 8 +------- tests/py/dynamo/conversion/harness.py | 2 -- tests/py/dynamo/models/test_dtype_support.py | 1 - tests/py/dynamo/models/test_model_refit.py | 1 - tests/py/dynamo/models/test_modelopt_models.py | 2 -- tests/py/dynamo/runtime/test_002_cudagraphs_py.py | 1 - 7 files changed, 2 insertions(+), 14 deletions(-) diff --git a/.github/workflows/build-test-linux.yml b/.github/workflows/build-test-linux.yml index 9efb2ab146..e740fb1e49 100644 --- a/.github/workflows/build-test-linux.yml +++ b/.github/workflows/build-test-linux.yml @@ -175,6 +175,7 @@ jobs: python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dynamo_models.xml --ir dynamo models/test_models.py python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dynamo_models_dynamic.xml --ir dynamo models/test_dyn_models.py python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/engine_cache.xml --ir dynamo models/test_engine_cache.py + python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dtype_support.xml --ir dynamo models/test_dtype_support.py python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/model_refit.xml --ir dynamo models/test_model_refit.py python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/modelopt_models.xml --ir dynamo models/test_modelopt_models.py python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/weight_stripped_engine.xml --ir dynamo models/test_weight_stripped_engine.py diff --git a/tests/py/dynamo/backend/test_backend_compiler.py b/tests/py/dynamo/backend/test_backend_compiler.py index 4c65800f05..6369d3805c 100644 --- a/tests/py/dynamo/backend/test_backend_compiler.py +++ b/tests/py/dynamo/backend/test_backend_compiler.py @@ -2,11 +2,10 @@ from copy import deepcopy import torch +import torch_tensorrt from torch.testing._internal.common_utils import TestCase, run_tests from torch_tensorrt.dynamo.partitioning import fast_partition -import torch_tensorrt - from ..testing_utilities import DECIMALS_OF_AGREEMENT, lower_graph_testing @@ -51,7 +50,6 @@ def forward(self, x, y): pass_through_build_failures=True, torch_executed_ops={"torch.ops.aten.add.Tensor"}, use_python_runtime=False, - debug=True, ) optimized_model_results = optimized_model(*inputs).detach().cpu() 
torch_model_results = fx_graph(*inputs).detach().cpu() @@ -132,7 +130,6 @@ def forward(self, x, y): pass_through_build_failures=True, torch_executed_ops={"torch.ops.aten.add.Tensor"}, use_python_runtime=False, - debug=True, ) optimized_model_results = optimized_model(*inputs).detach().cpu() torch_model_results = model(*inputs).detach().cpu() @@ -177,7 +174,6 @@ def forward(self, x, y): optimization_level=4, version_compatible=True, max_aux_streams=5, - debug=True, ) optimized_model_results = optimized_model(*inputs).detach().cpu() torch_model_results = fx_graph(*inputs).detach().cpu() @@ -225,7 +221,6 @@ def forward(self, x, y): min_block_size=1, pass_through_build_failures=True, truncate_double=True, - debug=True, ) optimized_model_results = optimized_model(*inputs).detach().cpu() torch_model_results = fx_graph(*inputs).detach().cpu() @@ -298,7 +293,6 @@ def forward(self, x, y): min_block_size=1, pass_through_build_failures=True, truncate_double=False, - debug=True, torch_executed_ops={"torch.ops.aten.add.Tensor"}, ) optimized_model_results = optimized_model(*inputs).detach().cpu() diff --git a/tests/py/dynamo/conversion/harness.py b/tests/py/dynamo/conversion/harness.py index 6ff45507a0..aa22a74fc0 100644 --- a/tests/py/dynamo/conversion/harness.py +++ b/tests/py/dynamo/conversion/harness.py @@ -415,7 +415,6 @@ def run_test( compilation_settings = CompilationSettings( enabled_precisions={dtype._from(precision)}, truncate_double=True, - debug=True, immutable_weights=immutable_weights, ) @@ -507,7 +506,6 @@ def run_test_compare_tensor_attributes_only( compilation_settings = CompilationSettings( enabled_precisions={dtype._from(precision)}, truncate_double=True, - debug=True, immutable_weights=immutable_weights, ) diff --git a/tests/py/dynamo/models/test_dtype_support.py b/tests/py/dynamo/models/test_dtype_support.py index 146f7fdb7d..37b40574a1 100644 --- a/tests/py/dynamo/models/test_dtype_support.py +++ b/tests/py/dynamo/models/test_dtype_support.py @@ -297,7 +297,6 @@ def forward(self, x): ir="torch_compile", inputs=inputs, enabled_precisions={torch.bfloat16}, - debug=True, min_block_size=1, device=device, cache_built_engines=False, diff --git a/tests/py/dynamo/models/test_model_refit.py b/tests/py/dynamo/models/test_model_refit.py index d71091b04e..b170bcc47d 100644 --- a/tests/py/dynamo/models/test_model_refit.py +++ b/tests/py/dynamo/models/test_model_refit.py @@ -815,7 +815,6 @@ def forward(self, x): exp_program, tuple(inputs), enabled_precisions={torch.float}, - debug=True, min_block_size=1, immutable_weights=False, ) diff --git a/tests/py/dynamo/models/test_modelopt_models.py b/tests/py/dynamo/models/test_modelopt_models.py index 50ccd2880d..51a74b022a 100644 --- a/tests/py/dynamo/models/test_modelopt_models.py +++ b/tests/py/dynamo/models/test_modelopt_models.py @@ -58,7 +58,6 @@ def calibrate_loop(model): inputs=[input_tensor], enabled_precisions={torch.float8_e4m3fn}, min_block_size=1, - debug=True, cache_built_engines=False, reuse_cached_engines=False, ) @@ -109,7 +108,6 @@ def calibrate_loop(model): inputs=[input_tensor], enabled_precisions={torch.int8}, min_block_size=1, - debug=True, cache_built_engines=False, reuse_cached_engines=False, truncate_double=True, diff --git a/tests/py/dynamo/runtime/test_002_cudagraphs_py.py b/tests/py/dynamo/runtime/test_002_cudagraphs_py.py index 0a4629644d..0c9b8bc13f 100644 --- a/tests/py/dynamo/runtime/test_002_cudagraphs_py.py +++ b/tests/py/dynamo/runtime/test_002_cudagraphs_py.py @@ -61,7 +61,6 @@ def forward(self, x): min_block_size=1, 
             pass_through_build_failures=True,
             use_python_runtime=True,
-            debug=True,
         )
 
         result_samples = []

From 9f32b3d7f1cf7929f7e87a7bfc693c4c125bc003 Mon Sep 17 00:00:00 2001
From: Dheeraj Peri
Date: Tue, 22 Apr 2025 11:57:16 -0700
Subject: [PATCH 10/13] chore: updates

---
 py/torch_tensorrt/dynamo/conversion/impl/quantize.py | 2 ++
 tests/py/dynamo/models/test_modelopt_models.py       | 3 ++-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/py/torch_tensorrt/dynamo/conversion/impl/quantize.py b/py/torch_tensorrt/dynamo/conversion/impl/quantize.py
index e472ed3092..c23fd55c5a 100644
--- a/py/torch_tensorrt/dynamo/conversion/impl/quantize.py
+++ b/py/torch_tensorrt/dynamo/conversion/impl/quantize.py
@@ -45,7 +45,9 @@ def quantize(
             max_bound = 448
 
         amax = to_torch(amax, None)
+        print(f"=========== AMAX: {amax}")
         scale = torch.divide(amax, max_bound)
+        print(f"=========== SCALE: {scale}")
         scale = get_trt_tensor(ctx, scale, name + "_scale")
         # Add Q node
         quantize_layer = ctx.net.add_quantize(input_tensor, scale)
diff --git a/tests/py/dynamo/models/test_modelopt_models.py b/tests/py/dynamo/models/test_modelopt_models.py
index 51a74b022a..36dce5af32 100644
--- a/tests/py/dynamo/models/test_modelopt_models.py
+++ b/tests/py/dynamo/models/test_modelopt_models.py
@@ -100,7 +100,7 @@ def calibrate_loop(model):
     # model has INT8 qdq nodes at this point
     output_pyt = model(input_tensor)
 
-    with torch.no_grad():
+    with torchtrt.logging.debug(), torch.no_grad():
         with export_torch_mode():
             exp_program = torch.export.export(model, (input_tensor,))
             trt_model = torchtrt.dynamo.compile(
@@ -111,6 +111,7 @@ def calibrate_loop(model):
                 cache_built_engines=False,
                 reuse_cached_engines=False,
                 truncate_double=True,
+                debug=True,
             )
             outputs_trt = trt_model(input_tensor)
             assert torch.allclose(output_pyt, outputs_trt, rtol=5e-3, atol=1e-2)

From 8c97e337b8e4fdbf528aa003f57bfb6b85c341dc Mon Sep 17 00:00:00 2001
From: Dheeraj Peri
Date: Tue, 22 Apr 2025 14:42:44 -0700
Subject: [PATCH 11/13] chore: bug fixes

---
 py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py | 2 +-
 py/torch_tensorrt/dynamo/conversion/impl/quantize.py   | 2 --
 tests/py/dynamo/models/test_modelopt_models.py         | 4 ++--
 tests/py/requirements.txt                              | 2 +-
 4 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py
index fde07bf1f5..5bd3efd35c 100644
--- a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py
+++ b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py
@@ -898,7 +898,7 @@ def get_attr(self, target: str, args: Any, kwargs: Any) -> np.ndarray:
         else:
             constant_tensor = frozen_attr
 
-            return to_torch(constant_tensor)
+        return to_torch(constant_tensor)
 
     def call_method(self, target: str, args: Any, kwargs: Any) -> Any:
         assert isinstance(target, str)
diff --git a/py/torch_tensorrt/dynamo/conversion/impl/quantize.py b/py/torch_tensorrt/dynamo/conversion/impl/quantize.py
index c23fd55c5a..e472ed3092 100644
--- a/py/torch_tensorrt/dynamo/conversion/impl/quantize.py
+++ b/py/torch_tensorrt/dynamo/conversion/impl/quantize.py
@@ -45,9 +45,7 @@ def quantize(
             max_bound = 448
 
         amax = to_torch(amax, None)
-        print(f"=========== AMAX: {amax}")
         scale = torch.divide(amax, max_bound)
-        print(f"=========== SCALE: {scale}")
         scale = get_trt_tensor(ctx, scale, name + "_scale")
         # Add Q node
         quantize_layer = ctx.net.add_quantize(input_tensor, scale)
diff --git a/tests/py/dynamo/models/test_modelopt_models.py b/tests/py/dynamo/models/test_modelopt_models.py
index 36dce5af32..c2cd719bf9 100644
--- a/tests/py/dynamo/models/test_modelopt_models.py
+++ b/tests/py/dynamo/models/test_modelopt_models.py
@@ -68,7 +68,7 @@ def calibrate_loop(model):
 @unittest.skipIf(
     platform.system() != "Linux"
     or not importlib.util.find_spec("modelopt")
-    or Version(metadata.version("nvidia-modelopt")) < Version("0.17.0"),
+    or Version(metadata.version("nvidia-modelopt")) < Version("0.27.0"),
     "modelopt 0.17.0 or later is required, Int8 quantization is supported in modelopt since 0.17.0 or later for linux",
 )
 @pytest.mark.unit
@@ -102,7 +102,7 @@ def calibrate_loop(model):
 
     with torchtrt.logging.debug(), torch.no_grad():
         with export_torch_mode():
-            exp_program = torch.export.export(model, (input_tensor,))
+            exp_program = torch.export.export(model, (input_tensor,), strict=False)
             trt_model = torchtrt.dynamo.compile(
                 exp_program,
                 inputs=[input_tensor],
diff --git a/tests/py/requirements.txt b/tests/py/requirements.txt
index 4f3c4e083b..1925ef5839 100644
--- a/tests/py/requirements.txt
+++ b/tests/py/requirements.txt
@@ -9,5 +9,5 @@ pytest-xdist>=3.6.1
 pyyaml
 timm>=1.0.3
 transformers==4.49.0
-nvidia-modelopt[deploy,hf,torch]~=0.17.0; python_version < "3.13"
+nvidia-modelopt[deploy,hf,torch]~=0.27.0; python_version < "3.13"
 --extra-index-url https://pypi.nvidia.com

From d27759b9946e935e89d26fa1cb99ebc3533950de Mon Sep 17 00:00:00 2001
From: Dheeraj Peri
Date: Tue, 29 Apr 2025 12:17:30 -0700
Subject: [PATCH 12/13] chore: disable flashinfer rmsnorm test

---
 .../test_flashinfer_rmsnorm.py | 34 ++++++++++---------
 1 file changed, 18 insertions(+), 16 deletions(-)

diff --git a/tests/py/dynamo/automatic_plugin/test_flashinfer_rmsnorm.py b/tests/py/dynamo/automatic_plugin/test_flashinfer_rmsnorm.py
index cf803c5ffa..fd5ed390ff 100644
--- a/tests/py/dynamo/automatic_plugin/test_flashinfer_rmsnorm.py
+++ b/tests/py/dynamo/automatic_plugin/test_flashinfer_rmsnorm.py
@@ -1,8 +1,6 @@
-import pytest
-
-flashinfer = pytest.importorskip("flashinfer")
 import unittest
 
+import pytest
 import torch
 import torch.nn as nn
 import torch_tensorrt
@@ -12,25 +10,29 @@ from ..conversion.harness import DispatchTestCase
 
 
+# Toggle this flag to enable/disable flashinfer-based overrides
+enable_flashinfer: bool = False
+if enable_flashinfer:
+    import flashinfer
 
-@torch.library.custom_op("flashinfer::rmsnorm", mutates_args=())  # type: ignore[misc]
-def flashinfer_rmsnorm(
-    input: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6
-) -> torch.Tensor:
-    return flashinfer.norm.rmsnorm(input, weight)
+    @torch.library.custom_op("flashinfer::rmsnorm", mutates_args=())  # type: ignore[misc]
+    def flashinfer_rmsnorm(
+        input: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6
+    ) -> torch.Tensor:
+        return flashinfer.norm.rmsnorm(input, weight)
 
+    @torch.library.register_fake("flashinfer::rmsnorm")
+    def _(input: torch.Tensor, weight: torch.Tensor, b: float = 1e-6) -> torch.Tensor:
+        return input
 
-@torch.library.register_fake("flashinfer::rmsnorm")
-def _(input: torch.Tensor, weight: torch.Tensor, b: float = 1e-6) -> torch.Tensor:
-    return input
+    torch_tensorrt.dynamo.conversion.plugins.custom_op(
+        "flashinfer::rmsnorm", supports_dynamic_shapes=True
+    )
 
 
-torch_tensorrt.dynamo.conversion.plugins.custom_op(
-    "flashinfer::rmsnorm", supports_dynamic_shapes=True
+@unittest.skip(
+    "Flashinfer RMSNorm test is disabled due to error: SM75 support not available"
 )
-
-
-@unittest.skip("Not Available")
 class TestAutomaticPlugin(DispatchTestCase):
     @parameterized.expand(
         [

From 35defa84a94e3ae3fef593f8fb9c1c2089321bdb Mon Sep 17 00:00:00 2001
From: Dheeraj Peri
Date: Tue, 29 Apr 2025 13:49:29 -0700
Subject: [PATCH 13/13] chore: fix modelopt onnx dep

---
 tests/py/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/py/requirements.txt b/tests/py/requirements.txt
index 1925ef5839..94db519d28 100644
--- a/tests/py/requirements.txt
+++ b/tests/py/requirements.txt
@@ -9,5 +9,5 @@ pytest-xdist>=3.6.1
 pyyaml
 timm>=1.0.3
 transformers==4.49.0
-nvidia-modelopt[deploy,hf,torch]~=0.27.0; python_version < "3.13"
+nvidia-modelopt[all]~=0.27.0; python_version < "3.13"
 --extra-index-url https://pypi.nvidia.com