From 97273d44372f6388f3c9d7497f1debd6c669d32a Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Tue, 1 Apr 2025 01:04:15 -0700 Subject: [PATCH 01/13] fix: Change the translational layer from numpy to torch during conversion to handle additional data types (#3445) --- py/torch_tensorrt/dynamo/_refit.py | 122 +++++++++--------- .../dynamo/conversion/_TRTInterpreter.py | 40 +++--- .../dynamo/conversion/converter_utils.py | 103 +++++++++++++-- .../dynamo/conversion/impl/conv.py | 14 +- .../dynamo/conversion/impl/deconv.py | 14 +- .../dynamo/conversion/impl/quantize.py | 82 ++++++------ tests/py/dynamo/models/test_models.py | 91 +++++++++++++ tests/py/dynamo/models/test_models_export.py | 5 +- 8 files changed, 325 insertions(+), 146 deletions(-) diff --git a/py/torch_tensorrt/dynamo/_refit.py b/py/torch_tensorrt/dynamo/_refit.py index 96fc6daad2..c128e9cc82 100644 --- a/py/torch_tensorrt/dynamo/_refit.py +++ b/py/torch_tensorrt/dynamo/_refit.py @@ -9,6 +9,7 @@ import tensorrt as trt import torch from torch.export import ExportedProgram +from torch.fx.experimental.proxy_tensor import unset_fake_temporarily from torch_tensorrt._enums import dtype from torch_tensorrt._Input import Input from torch_tensorrt.dynamo import partitioning @@ -144,71 +145,72 @@ def _refit_single_trt_engine_with_gm( Refit a TensorRT Engine in place """ - refitted = set() - torch_device = get_model_device(new_gm) - refitter = trt.Refitter(old_engine, TRT_LOGGER) - weight_list = refitter.get_all_weights() - - if weight_name_map: - # Get the refitting mapping - trt_wt_location = ( - trt.TensorLocation.DEVICE - if torch_device.type == "cuda" - else trt.TensorLocation.HOST - ) + with unset_fake_temporarily(): + refitted = set() + torch_device = get_model_device(new_gm) + refitter = trt.Refitter(old_engine, TRT_LOGGER) + weight_list = refitter.get_all_weights() + + if weight_name_map: + # Get the refitting mapping + trt_wt_location = ( + trt.TensorLocation.DEVICE + if torch_device.type == "cuda" + else trt.TensorLocation.HOST + ) - constant_mapping: dict[str, Any] = weight_name_map.pop( - "constant_mapping", {} - ) # type: ignore - mapping = construct_refit_mapping_from_weight_name_map( - weight_name_map, new_gm.state_dict() - ) - constant_mapping_with_type = {} - - for constant_name, val in constant_mapping.items(): - np_weight_type = val.dtype - val_tensor = torch.from_numpy(val).cuda() - trt_dtype = dtype.try_from(np_weight_type).to(trt.DataType) - torch_dtype = dtype.try_from(np_weight_type).to(torch.dtype) - constant_mapping_with_type[constant_name] = ( - val_tensor.clone().reshape(-1).contiguous().to(torch_dtype), - trt_dtype, + constant_mapping: dict[str, Any] = weight_name_map.pop( + "constant_mapping", {} + ) # type: ignore + mapping = construct_refit_mapping_from_weight_name_map( + weight_name_map, new_gm.state_dict() ) + constant_mapping_with_type = {} + + for constant_name, val in constant_mapping.items(): + np_weight_type = val.dtype + val_tensor = torch.from_numpy(val).cuda() + trt_dtype = dtype.try_from(np_weight_type).to(trt.DataType) + torch_dtype = dtype.try_from(np_weight_type).to(torch.dtype) + constant_mapping_with_type[constant_name] = ( + val_tensor.clone().reshape(-1).contiguous().to(torch_dtype), + trt_dtype, + ) - mapping.update(constant_mapping_with_type) + mapping.update(constant_mapping_with_type) - for layer_name in weight_list: - if layer_name not in mapping: - logger.warning(f"{layer_name} is not found in weight mapping.") - continue - # Use Numpy to create weights - weight, weight_dtype = 
mapping[layer_name] - trt_wt_tensor = trt.Weights( - weight_dtype, weight.data_ptr(), torch.numel(weight) - ) - refitter.set_named_weights(layer_name, trt_wt_tensor, trt_wt_location) - assert ( - len(refitter.get_missing_weights()) == 0 - ), "Fast refitting failed due to incomplete mapping" + for layer_name in weight_list: + if layer_name not in mapping: + logger.warning(f"{layer_name} is not found in weight mapping.") + continue + # Use Numpy to create weights + weight, weight_dtype = mapping[layer_name] + trt_wt_tensor = trt.Weights( + weight_dtype, weight.data_ptr(), torch.numel(weight) + ) + refitter.set_named_weights(layer_name, trt_wt_tensor, trt_wt_location) + assert ( + len(refitter.get_missing_weights()) == 0 + ), "Fast refitting failed due to incomplete mapping" - else: - mapping = construct_refit_mapping(new_gm, input_list, settings) - trt_wt_location = trt.TensorLocation.HOST - for layer_name in weight_list: - if layer_name not in mapping: - raise AssertionError(f"{layer_name} is not found in weight mapping") - # Use Numpy to create weights - weight, datatype = mapping[layer_name] - trt_wt_tensor = trt.Weights(datatype, weight.ctypes.data, weight.size) - refitter.set_named_weights(layer_name, trt_wt_tensor, trt_wt_location) - refitted.add(layer_name) - - if len(refitted) != len(weight_list): - logger.warning("Not all weights have been refitted!!!") - - if not refitter.refit_cuda_engine(): - logger.error("Error: failed to refit new weights.") - raise AssertionError("Refitting failed.") + else: + mapping = construct_refit_mapping(new_gm, input_list, settings) + trt_wt_location = trt.TensorLocation.HOST + for layer_name in weight_list: + if layer_name not in mapping: + raise AssertionError(f"{layer_name} is not found in weight mapping") + # Use Numpy to create weights + weight, datatype = mapping[layer_name] + trt_wt_tensor = trt.Weights(datatype, weight.ctypes.data, weight.size) + refitter.set_named_weights(layer_name, trt_wt_tensor, trt_wt_location) + refitted.add(layer_name) + + if len(refitted) != len(weight_list): + logger.warning("Not all weights have been refitted!!!") + + if not refitter.refit_cuda_engine(): + logger.error("Error: failed to refit new weights.") + raise AssertionError("Refitting failed.") def refit_module_weights( diff --git a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py index 248e06bc3c..17f2fccbff 100644 --- a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py +++ b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py @@ -21,6 +21,7 @@ import tensorrt as trt import torch import torch.fx +from torch.fx.experimental.proxy_tensor import unset_fake_temporarily from torch.fx.node import _get_qualified_name from torch.fx.passes.shape_prop import TensorMetadata from torch.utils._python_dispatch import _disable_current_modes @@ -41,6 +42,7 @@ get_node_io, get_node_name, get_trt_tensor, + to_torch, ) from torch_tensorrt.dynamo.utils import DYNAMIC_DIM, get_model_device, to_torch_device from torch_tensorrt.fx.observer import Observer @@ -408,12 +410,13 @@ def find_weight( np_map: the map from weight name to np values in INetworkDefinition state_dict: state of the graph module """ - network_weight = torch.from_numpy(np_map[weight_name]).to(device) - for sd_w_name, sd_weight in state_dict.items(): - if TRTInterpreter.check_weight_equal(sd_weight, network_weight, device): - del state_dict[sd_w_name] - return sd_w_name - return "" + with unset_fake_temporarily(): + network_weight = 
torch.from_numpy(np_map[weight_name]).to(device) + for sd_w_name, sd_weight in state_dict.items(): + if TRTInterpreter.check_weight_equal(sd_weight, network_weight, device): + del state_dict[sd_w_name] + return sd_w_name + return "" @staticmethod def check_weight_equal( @@ -421,14 +424,15 @@ def check_weight_equal( network_weight: Union[torch.Tensor, np.ndarray], device: torch.device, ) -> Any: - if not isinstance(network_weight, torch.Tensor): - network_weight = torch.from_numpy(network_weight).to(device) - try: - return sd_weight.shape == network_weight.shape and torch.all( - torch.abs(sd_weight - network_weight) < 0.01 - ) - except Exception: - return torch.all(sd_weight == network_weight) + with unset_fake_temporarily(): + if not isinstance(network_weight, torch.Tensor): + network_weight = torch.from_numpy(network_weight).to(device) + try: + return sd_weight.shape == network_weight.shape and torch.all( + torch.abs(sd_weight - network_weight) < 0.01 + ) + except Exception: + return torch.all(sd_weight == network_weight) def _save_weight_mapping(self) -> None: """ @@ -887,9 +891,7 @@ def call_function(self, target: str, args: Any, kwargs: Any) -> Any: return converter(self.ctx, target, args, kwargs, self._cur_node_name) def get_attr(self, target: str, args: Any, kwargs: Any) -> np.ndarray: - with _disable_current_modes(): - from torch_tensorrt.dynamo.conversion.converter_utils import to_numpy - + with _disable_current_modes(), unset_fake_temporarily(): frozen_attr = self.fetch_attr(target) if isinstance(frozen_attr, torch.nn.Parameter): @@ -897,9 +899,7 @@ def get_attr(self, target: str, args: Any, kwargs: Any) -> np.ndarray: else: constant_tensor = frozen_attr - network_constant = to_numpy(constant_tensor) - - return network_constant + return to_torch(constant_tensor) def call_method(self, target: str, args: Any, kwargs: Any) -> Any: assert isinstance(target, str) diff --git a/py/torch_tensorrt/dynamo/conversion/converter_utils.py b/py/torch_tensorrt/dynamo/conversion/converter_utils.py index 62526080c4..bcb8495c67 100644 --- a/py/torch_tensorrt/dynamo/conversion/converter_utils.py +++ b/py/torch_tensorrt/dynamo/conversion/converter_utils.py @@ -9,6 +9,7 @@ import tensorrt as trt import torch import torch_tensorrt.dynamo.conversion.impl as impl +from torch.fx.experimental.proxy_tensor import unset_fake_temporarily from torch.fx.node import Argument, Target from torch.fx.passes.shape_prop import TensorMetadata from torch_tensorrt import _enums @@ -340,17 +341,47 @@ def create_constant( Returns: A TensorRT ITensor that represents the given value. """ - shape = (1,) - # Rank 0 constant is required in IFillLayer inputs. - if min_rank == 0: - shape = trt.Dims() - numpy_value = to_numpy(value, dtype) - constant = ctx.net.add_constant( - shape if isinstance(value, (int, float, bool)) else value.shape, - numpy_value.copy() if isinstance(numpy_value, np.ndarray) else numpy_value, - ) - constant.name = name - return constant.get_output(0) + with unset_fake_temporarily(): + + torch_value = to_torch(value, dtype) + if torch_value.dtype == torch.float64: + raise ValueError( + "TensorRT does not support float64 (double) precision. To resolve this, please set truncate_double=True in your compilation settings and re-run the model." + ) + # Rank 0 constant is required in IFillLayer inputs. 
+ if min_rank == 0 and isinstance(value, (int, float, bool)): + shape = trt.Dims() + elif list(torch_value.shape) == []: + shape = trt.Dims() + else: + shape = list(torch_value.shape) + + if torch_value is not None: + if torch_value.dtype == torch.bfloat16: + torch_value_fp32 = torch_value.to(torch.float32) + numpy_value = torch_value_fp32.numpy() + else: + numpy_value = torch_value.numpy() + + constant = ctx.net.add_constant( + shape, + numpy_value, + ) + constant.name = name + + if torch_value.dtype == torch.bfloat16: + return cast_trt_tensor( + ctx, + constant.get_output(0), + trt.DataType.BF16, + name + "_bf16_cast", + ) + + return constant.get_output(0) + else: + raise ValueError( + f"Cannot convert tensor '{name}' to a TensorRT constant because its value is None." + ) def get_trt_tensor( @@ -564,6 +595,9 @@ def to_numpy( value = value.dequantize() elif value.dtype == torch.bfloat16: # TODO: Remove when numpy has a BF16 type + _LOGGER.warning( + "Requested a conversion of bfloat16 tensor from torch to numpy which isn't supported. Casting this tensor to FP32 precision currently. Please use to_torch() API for better data representation", + ) value = value.to(torch.float) output = value.cpu().detach().contiguous().numpy() @@ -589,6 +623,53 @@ def to_numpy( ) +def to_torch( + value: Optional[Union[torch.Tensor, np.ndarray, int, float, bool]], + dtype: Optional[Union[torch.dtype, np.dtype, TRTDataType, _enums.dtype]] = None, +) -> Optional[torch.Tensor]: + """ + Convert a Numpy array, or scalar to a PyTorch tensor and move it to CPU + Args: + value (Optional[Union[torch.Tensor, np.ndarray, int, float, bool]]): + A PyTorch tensor, Numpy array, int, float, or bool + dtype (Optional[Union[torch.dtype, np.dtype, TRTDataType]]): + If a dtype is given, we will convert the type of the given `value` to this dtype. + Returns: + A PyTorch tensor or None, if the input was None. + """ + + cpu_device = torch.device("cpu") + torch_dtype = ( + _enums.dtype._from(dtype).to(torch.dtype, use_default=True) if dtype else None + ) + + with unset_fake_temporarily(): + if value is None: + return None + + elif isinstance(value, torch.Tensor): + output = value.to(cpu_device).contiguous() + + elif isinstance(value, np.ndarray): + output = torch.from_numpy(value).to(cpu_device).contiguous() + + elif isinstance(value, int): + output = torch.tensor([value], device=cpu_device, dtype=torch.int32) + + elif isinstance(value, float): + output = torch.tensor([value], device=cpu_device, dtype=torch.float32) + + elif isinstance(value, bool): + output = torch.tensor([value], device=cpu_device, dtype=torch.bool) + + else: + raise AssertionError( + f"to_torch can only be called on None, bool, int, float, np.ndarray, or torch.Tensor, got an object of type: {type(value)}" + ) + + return output.to(torch_dtype) if torch_dtype else output + + def flatten_dims( input: Sequence[Union[TRTTensor, torch.Tensor, np.ndarray]], start_dim: int, diff --git a/py/torch_tensorrt/dynamo/conversion/impl/conv.py b/py/torch_tensorrt/dynamo/conversion/impl/conv.py index 25419d7f60..f27fb13e97 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/conv.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/conv.py @@ -13,7 +13,7 @@ cast_trt_tensor, extend_attr_to_tuple, get_trt_tensor, - to_numpy, + to_torch, ) from torch_tensorrt.fx.converters.converter_utils import ( get_dyn_range, @@ -45,7 +45,6 @@ def convNd( assert input.shape[1] != -1, "Channel dim can't be dynamic for convolution." 
num_dims = len(input.shape) - 2 - if is_conv1d: # Apply an unsqueeze operation to transform the conv1d problem into conv2d input = impl.unsqueeze.unsqueeze( @@ -54,8 +53,8 @@ def convNd( # Process bias terms if isinstance(bias, (torch.Tensor, np.ndarray)): - # Transform the bias constant into a Numpy array - bias = to_numpy(bias, dtype=input.dtype) + bias = to_torch(bias, dtype=input.dtype) + bias = get_trt_tensor(ctx, bias, f"{name}_bias") elif isinstance(bias, TRTTensor): bias = get_trt_tensor(ctx, bias, f"{name}_bias") @@ -74,12 +73,11 @@ def convNd( ctx, target, source_ir, weight.name + "_unsqueeze_conv1d", weight, -1 ) elif isinstance(weight, (torch.Tensor, np.ndarray)): - # Transform the weight constant into a Numpy array - weight = to_numpy(weight, dtype=input.dtype) - + weight = to_torch(weight, dtype=input.dtype) # Append new dimension (unsqueeze) if the convolution is 1d if is_conv1d: - weight = np.expand_dims(weight, -1) + weight = torch.unsqueeze(weight, -1) + weight = get_trt_tensor(ctx, weight, f"{name}_weight") else: raise RuntimeError( diff --git a/py/torch_tensorrt/dynamo/conversion/impl/deconv.py b/py/torch_tensorrt/dynamo/conversion/impl/deconv.py index d19a92e646..629cecf5db 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/deconv.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/deconv.py @@ -6,13 +6,12 @@ import tensorrt as trt import torch from torch.fx.node import Target - from torch_tensorrt.dynamo.conversion import impl from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext from torch_tensorrt.dynamo.conversion.converter_utils import ( extend_attr_to_tuple, get_trt_tensor, - to_numpy, + to_torch, ) from torch_tensorrt.fx.converters.converter_utils import ( SourceIR, @@ -53,7 +52,8 @@ def deconvNd( # Process bias terms if isinstance(bias, (torch.Tensor, np.ndarray)): # Transform the bias constant into a Numpy array - bias = to_numpy(bias) + bias = to_torch(bias, dtype=input.dtype) + bias = get_trt_tensor(ctx, bias, f"{name}_bias") elif isinstance(bias, TRTTensor): bias = get_trt_tensor(ctx, bias, f"{name}_bias") @@ -73,12 +73,12 @@ def deconvNd( ) elif isinstance(weight, (torch.Tensor, np.ndarray)): - # Transform the weight constant into a Numpy array - weight = to_numpy(weight) - + weight = to_torch(weight, dtype=input.dtype) # Append new dimension (unsqueeze) if the deconvolution is 1d if is_deconv1d: - weight = np.expand_dims(weight, axis=-1) + weight = torch.unsqueeze(weight, -1) + + weight = get_trt_tensor(ctx, weight, f"{name}_weight") else: raise RuntimeError( diff --git a/py/torch_tensorrt/dynamo/conversion/impl/quantize.py b/py/torch_tensorrt/dynamo/conversion/impl/quantize.py index b97840cd09..e472ed3092 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/quantize.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/quantize.py @@ -1,11 +1,13 @@ -from typing import Optional +from typing import Optional, Union import numpy as np import tensorrt as trt +import torch +from torch.fx.experimental.proxy_tensor import unset_fake_temporarily from torch.fx.node import Target from torch_tensorrt.dynamo._SourceIR import SourceIR from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext -from torch_tensorrt.dynamo.conversion.converter_utils import get_trt_tensor +from torch_tensorrt.dynamo.conversion.converter_utils import get_trt_tensor, to_torch from torch_tensorrt.fx.converters.converter_utils import set_layer_name from torch_tensorrt.fx.types import TRTTensor @@ -16,7 +18,7 @@ def quantize( source_ir: 
Optional[SourceIR], name: str, input_tensor: TRTTensor, - amax: np.ndarray, + amax: Union[np.ndarray, torch.Tensor], num_bits: int, exponent_bits: int, ) -> TRTTensor: @@ -24,40 +26,44 @@ def quantize( Adds quantize and dequantize ops (QDQ) which quantize to INT8 or FP8 based on the output_type set and dequantizes them back. """ - if isinstance(input_tensor, TRTTensor) and input_tensor.dtype not in ( - trt.float32, - trt.float16, - ): - raise ValueError( - f"quantize converter received an input of {input_tensor.dtype} type. Supported types: float32 | float16" - ) - if num_bits != 8 or exponent_bits not in (0, 4): - raise ValueError( - f"quantize converter currently only accept INT8 or FP8 based quantize, got {num_bits=}, {exponent_bits=}" - ) - if num_bits == 8 and exponent_bits == 0: - max_bound = 127 - elif num_bits == 8 and exponent_bits == 4: - max_bound = 448 - scale = np.divide(amax, max_bound) - scale = get_trt_tensor(ctx, scale, name + "_scale") - # Add Q node - quantize_layer = ctx.net.add_quantize(input_tensor, scale) - if num_bits == 8 and exponent_bits == 0: - quantize_layer.set_output_type(0, trt.DataType.INT8) - elif num_bits == 8 and exponent_bits == 4: - quantize_layer.set_output_type(0, trt.DataType.FP8) - set_layer_name(quantize_layer, target, name + "_quantize", source_ir) - q_output = quantize_layer.get_output(0) - # Add DQ node - dequantize_layer = ctx.net.add_dequantize(q_output, scale) - set_layer_name(dequantize_layer, target, name + "_dequantize", source_ir) - if num_bits == 8 and exponent_bits == 0: - dequantize_layer.precision = trt.DataType.INT8 - elif num_bits == 8 and exponent_bits == 4: - # Set DQ layer precision to FP8 - dequantize_layer.precision = trt.DataType.FP8 - dq_output = dequantize_layer.get_output(0) + with unset_fake_temporarily(): + if isinstance(input_tensor, TRTTensor) and input_tensor.dtype not in ( + trt.float32, + trt.float16, + ): + raise ValueError( + f"quantize converter received an input of {input_tensor.dtype} type. 
Supported types: float32 | float16" + ) + if num_bits != 8 or exponent_bits not in (0, 4): + raise ValueError( + f"quantize converter currently only accept INT8 or FP8 based quantize, got {num_bits=}, {exponent_bits=}" + ) + if num_bits == 8 and exponent_bits == 0: + max_bound = 127 + elif num_bits == 8 and exponent_bits == 4: + max_bound = 448 - return dq_output + amax = to_torch(amax, None) + scale = torch.divide(amax, max_bound) + scale = get_trt_tensor(ctx, scale, name + "_scale") + # Add Q node + quantize_layer = ctx.net.add_quantize(input_tensor, scale) + if num_bits == 8 and exponent_bits == 0: + quantize_layer.set_output_type(0, trt.DataType.INT8) + elif num_bits == 8 and exponent_bits == 4: + quantize_layer.set_output_type(0, trt.DataType.FP8) + + set_layer_name(quantize_layer, target, name + "_quantize", source_ir) + q_output = quantize_layer.get_output(0) + # Add DQ node + dequantize_layer = ctx.net.add_dequantize(q_output, scale) + set_layer_name(dequantize_layer, target, name + "_dequantize", source_ir) + if num_bits == 8 and exponent_bits == 0: + dequantize_layer.precision = trt.DataType.INT8 + elif num_bits == 8 and exponent_bits == 4: + # Set DQ layer precision to FP8 + dequantize_layer.precision = trt.DataType.FP8 + dq_output = dequantize_layer.get_output(0) + + return dq_output diff --git a/tests/py/dynamo/models/test_models.py b/tests/py/dynamo/models/test_models.py index b6f986711a..6314baa5ec 100644 --- a/tests/py/dynamo/models/test_models.py +++ b/tests/py/dynamo/models/test_models.py @@ -182,3 +182,94 @@ def test_resnet18_half(ir): # Clean up model env torch._dynamo.reset() + + +@pytest.mark.unit +def test_bf16_model(ir): + class MyModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = torch.nn.Conv2d(3, 16, 3, stride=1, bias=True) + self.relu = torch.nn.ReLU() + + def forward(self, x): + out = self.conv(x) + out = self.relu(out) + return out + + model = MyModule().eval().cuda().to(torch.bfloat16) + input = torch.randn((1, 3, 224, 224)).to("cuda").to(torch.bfloat16) + + compile_spec = { + "inputs": [ + torchtrt.Input( + input.shape, dtype=torch.bfloat16, format=torch.contiguous_format + ) + ], + "device": torchtrt.Device("cuda:0"), + "enabled_precisions": {torch.float32}, + "ir": ir, + "pass_through_build_failures": True, + "min_block_size": 1, + "cache_built_engines": False, + "reuse_cached_engines": False, + "use_explicit_typing": True, + } + + trt_mod = torchtrt.compile(model, **compile_spec) + cos_sim = cosine_similarity(model(input), trt_mod(input)) + + assertions.assertTrue( + cos_sim > COSINE_THRESHOLD, + msg=f"BF16 model TRT outputs don't match with the original model. 
Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + ) + + # Clean up model env + torch._dynamo.reset() + + +@pytest.mark.unit +def test_bf16_fallback_model(ir): + class MyModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = torch.nn.Conv2d(3, 16, 3, padding=1, stride=1, bias=True) + self.relu = torch.nn.ReLU() + self.conv2 = torch.nn.Conv2d(16, 16, 3, padding=1, stride=1, bias=True) + + def forward(self, x): + out = self.conv(x) + out = self.relu(out) + out = self.conv2(out) + return out + + model = MyModule().eval().cuda().to(torch.bfloat16) + input = torch.randn((1, 3, 224, 224)).to("cuda").to(torch.bfloat16) + + compile_spec = { + "inputs": [ + torchtrt.Input( + input.shape, dtype=torch.bfloat16, format=torch.contiguous_format + ) + ], + "device": torchtrt.Device("cuda:0"), + "enabled_precisions": {torch.float32}, + "ir": ir, + "pass_through_build_failures": True, + "min_block_size": 1, + "cache_built_engines": False, + "reuse_cached_engines": False, + "use_explicit_typing": True, + "torch_executed_ops": {"torch.ops.aten.relu.default"}, + } + + trt_mod = torchtrt.compile(model, **compile_spec) + cos_sim = cosine_similarity(model(input), trt_mod(input)) + + assertions.assertTrue( + cos_sim > COSINE_THRESHOLD, + msg=f"BF16 fallback model TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", + ) + + # Clean up model env + torch._dynamo.reset() diff --git a/tests/py/dynamo/models/test_models_export.py b/tests/py/dynamo/models/test_models_export.py index 469ed569d1..6f96e259b0 100644 --- a/tests/py/dynamo/models/test_models_export.py +++ b/tests/py/dynamo/models/test_models_export.py @@ -249,6 +249,7 @@ def calibrate_loop(model): @unittest.skipIf( platform.system() != "Linux" + or torch.cuda.get_device_capability() < (8, 9) or not importlib.util.find_spec("modelopt") or Version(metadata.version("nvidia-modelopt")) < Version("0.17.0"), "modelopt 0.17.0 or later is required, Int8 quantization is supported in modelopt since 0.17.0 or later for linux", @@ -257,7 +258,6 @@ def calibrate_loop(model): def test_base_int8(ir): import modelopt.torch.quantization as mtq from modelopt.torch.quantization.utils import export_torch_mode - from torch.export._trace import _export class SimpleNetwork(torch.nn.Module): def __init__(self): @@ -285,7 +285,7 @@ def calibrate_loop(model): with torch.no_grad(): with export_torch_mode(): - exp_program = _export(model, (input_tensor,)) + exp_program = torch.export.export(model, (input_tensor,)) trt_model = torchtrt.dynamo.compile( exp_program, inputs=[input_tensor], @@ -294,6 +294,7 @@ def calibrate_loop(model): debug=True, cache_built_engines=False, reuse_cached_engines=False, + truncate_double=True, ) outputs_trt = trt_model(input_tensor) assert torch.allclose(output_pyt, outputs_trt, rtol=5e-3, atol=1e-2) From 3b177ac130af9e3d8961a21dd34f21543bf4ad44 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Tue, 1 Apr 2025 01:09:40 -0700 Subject: [PATCH 02/13] chore: reenable the test --- tests/py/dynamo/models/test_models_export.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/py/dynamo/models/test_models_export.py b/tests/py/dynamo/models/test_models_export.py index 6f96e259b0..f5230f3ace 100644 --- a/tests/py/dynamo/models/test_models_export.py +++ b/tests/py/dynamo/models/test_models_export.py @@ -249,7 +249,6 @@ def calibrate_loop(model): @unittest.skipIf( platform.system() != "Linux" - or torch.cuda.get_device_capability() < (8, 9) or not 
importlib.util.find_spec("modelopt") or Version(metadata.version("nvidia-modelopt")) < Version("0.17.0"), "modelopt 0.17.0 or later is required, Int8 quantization is supported in modelopt since 0.17.0 or later for linux", From 112f67b761a853d321dc4eb1423ec73fb08218b8 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Tue, 1 Apr 2025 11:32:33 -0700 Subject: [PATCH 03/13] chore: separate the tests --- .github/workflows/build-test-linux.yml | 1 + tests/py/dynamo/models/test_models_export.py | 105 ------------------- 2 files changed, 1 insertion(+), 105 deletions(-) diff --git a/.github/workflows/build-test-linux.yml b/.github/workflows/build-test-linux.yml index 024afd8c62..91b7f6633b 100644 --- a/.github/workflows/build-test-linux.yml +++ b/.github/workflows/build-test-linux.yml @@ -174,6 +174,7 @@ jobs: python -m pip install -r requirements.txt cd dynamo python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dyn_models_export.xml --ir dynamo models/ + python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dyn_models_export.xml --ir dynamo test_modelopt_models.py popd tests-py-dynamo-serde: diff --git a/tests/py/dynamo/models/test_models_export.py b/tests/py/dynamo/models/test_models_export.py index f5230f3ace..9dfc7abebd 100644 --- a/tests/py/dynamo/models/test_models_export.py +++ b/tests/py/dynamo/models/test_models_export.py @@ -192,108 +192,3 @@ def test_resnet18_half(ir): # Clean up model env torch._dynamo.reset() - - -@unittest.skipIf( - torch.cuda.get_device_capability() < (8, 9), - "FP8 quantization requires compute capability 8.9 or later", -) -@unittest.skipIf( - not importlib.util.find_spec("modelopt"), - "ModelOpt is required to run this test", -) -@pytest.mark.unit -def test_base_fp8(ir): - import modelopt.torch.quantization as mtq - from modelopt.torch.quantization.utils import export_torch_mode - - class SimpleNetwork(torch.nn.Module): - def __init__(self): - super(SimpleNetwork, self).__init__() - self.linear1 = torch.nn.Linear(in_features=10, out_features=5) - self.linear2 = torch.nn.Linear(in_features=5, out_features=1) - - def forward(self, x): - x = self.linear1(x) - x = torch.nn.ReLU()(x) - x = self.linear2(x) - return x - - def calibrate_loop(model): - """Simple calibration function for testing.""" - model(input_tensor) - - input_tensor = torch.randn(1, 10).cuda() - model = SimpleNetwork().eval().cuda() - - quant_cfg = mtq.FP8_DEFAULT_CFG - mtq.quantize(model, quant_cfg, forward_loop=calibrate_loop) - # model has FP8 qdq nodes at this point - output_pyt = model(input_tensor) - - with torch.no_grad(): - with export_torch_mode(): - exp_program = torch.export.export(model, (input_tensor,), strict=False) - trt_model = torchtrt.dynamo.compile( - exp_program, - inputs=[input_tensor], - enabled_precisions={torch.float8_e4m3fn}, - min_block_size=1, - debug=True, - cache_built_engines=False, - reuse_cached_engines=False, - ) - outputs_trt = trt_model(input_tensor) - assert torch.allclose(output_pyt, outputs_trt, rtol=5e-3, atol=1e-2) - - -@unittest.skipIf( - platform.system() != "Linux" - or not importlib.util.find_spec("modelopt") - or Version(metadata.version("nvidia-modelopt")) < Version("0.17.0"), - "modelopt 0.17.0 or later is required, Int8 quantization is supported in modelopt since 0.17.0 or later for linux", -) -@pytest.mark.unit -def test_base_int8(ir): - import modelopt.torch.quantization as mtq - from modelopt.torch.quantization.utils import export_torch_mode - - class SimpleNetwork(torch.nn.Module): - def __init__(self): - super(SimpleNetwork, 
self).__init__() - self.linear1 = torch.nn.Linear(in_features=10, out_features=5) - self.linear2 = torch.nn.Linear(in_features=5, out_features=1) - - def forward(self, x): - x = self.linear1(x) - x = torch.nn.ReLU()(x) - x = self.linear2(x) - return x - - def calibrate_loop(model): - """Simple calibration function for testing.""" - model(input_tensor) - - input_tensor = torch.randn(1, 10).cuda() - model = SimpleNetwork().eval().cuda() - - quant_cfg = mtq.INT8_DEFAULT_CFG - mtq.quantize(model, quant_cfg, forward_loop=calibrate_loop) - # model has INT8 qdq nodes at this point - output_pyt = model(input_tensor) - - with torch.no_grad(): - with export_torch_mode(): - exp_program = torch.export.export(model, (input_tensor,)) - trt_model = torchtrt.dynamo.compile( - exp_program, - inputs=[input_tensor], - enabled_precisions={torch.int8}, - min_block_size=1, - debug=True, - cache_built_engines=False, - reuse_cached_engines=False, - truncate_double=True, - ) - outputs_trt = trt_model(input_tensor) - assert torch.allclose(output_pyt, outputs_trt, rtol=5e-3, atol=1e-2) From 736ff2520e741f9a0570ae4a9fbbcb29284cf3c2 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Tue, 1 Apr 2025 12:40:54 -0700 Subject: [PATCH 04/13] chore: minor fix --- .github/workflows/build-test-linux.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-test-linux.yml b/.github/workflows/build-test-linux.yml index 91b7f6633b..7c92573f4e 100644 --- a/.github/workflows/build-test-linux.yml +++ b/.github/workflows/build-test-linux.yml @@ -174,7 +174,7 @@ jobs: python -m pip install -r requirements.txt cd dynamo python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dyn_models_export.xml --ir dynamo models/ - python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dyn_models_export.xml --ir dynamo test_modelopt_models.py + python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dyn_models_export.xml test_modelopt_models.py popd tests-py-dynamo-serde: From b37add3aa3e62b7ec16c0e7419a1fd9b03db26b5 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Tue, 8 Apr 2025 15:11:16 -0700 Subject: [PATCH 05/13] chore: updates --- tests/py/dynamo/test_modelopt_models.py | 118 ++++++++++++++++++++++++ 1 file changed, 118 insertions(+) create mode 100644 tests/py/dynamo/test_modelopt_models.py diff --git a/tests/py/dynamo/test_modelopt_models.py b/tests/py/dynamo/test_modelopt_models.py new file mode 100644 index 0000000000..2593d03e74 --- /dev/null +++ b/tests/py/dynamo/test_modelopt_models.py @@ -0,0 +1,118 @@ +# type: ignore +import importlib +import platform +import unittest +from importlib import metadata + +import pytest +import torch +import torch_tensorrt as torchtrt + +from packaging.version import Version + +assertions = unittest.TestCase() + + +@unittest.skipIf( + torch.cuda.get_device_capability() < (8, 9), + "FP8 quantization requires compute capability 8.9 or later", +) +@unittest.skipIf( + not importlib.util.find_spec("modelopt"), + "ModelOpt is required to run this test", +) +@pytest.mark.unit +def test_base_fp8(ir): + import modelopt.torch.quantization as mtq + from modelopt.torch.quantization.utils import export_torch_mode + + class SimpleNetwork(torch.nn.Module): + def __init__(self): + super(SimpleNetwork, self).__init__() + self.linear1 = torch.nn.Linear(in_features=10, out_features=5) + self.linear2 = torch.nn.Linear(in_features=5, out_features=1) + + def forward(self, x): + x = self.linear1(x) + x = torch.nn.ReLU()(x) + x = self.linear2(x) + return x + + def 
calibrate_loop(model): + """Simple calibration function for testing.""" + model(input_tensor) + + input_tensor = torch.randn(1, 10).cuda() + model = SimpleNetwork().eval().cuda() + + quant_cfg = mtq.FP8_DEFAULT_CFG + mtq.quantize(model, quant_cfg, forward_loop=calibrate_loop) + # model has FP8 qdq nodes at this point + output_pyt = model(input_tensor) + + with torch.no_grad(): + with export_torch_mode(): + exp_program = torch.export.export(model, (input_tensor,), strict=False) + trt_model = torchtrt.dynamo.compile( + exp_program, + inputs=[input_tensor], + enabled_precisions={torch.float8_e4m3fn}, + min_block_size=1, + debug=True, + cache_built_engines=False, + reuse_cached_engines=False, + ) + outputs_trt = trt_model(input_tensor) + assert torch.allclose(output_pyt, outputs_trt, rtol=5e-3, atol=1e-2) + + +@unittest.skipIf( + platform.system() != "Linux" + or not importlib.util.find_spec("modelopt") + or Version(metadata.version("nvidia-modelopt")) < Version("0.17.0"), + "modelopt 0.17.0 or later is required, Int8 quantization is supported in modelopt since 0.17.0 or later for linux", +) +@pytest.mark.unit +def test_base_int8(ir): + import modelopt.torch.quantization as mtq + from modelopt.torch.quantization.utils import export_torch_mode + + class SimpleNetwork(torch.nn.Module): + def __init__(self): + super(SimpleNetwork, self).__init__() + self.linear1 = torch.nn.Linear(in_features=10, out_features=5) + self.linear2 = torch.nn.Linear(in_features=5, out_features=1) + + def forward(self, x): + x = self.linear1(x) + x = torch.nn.ReLU()(x) + x = self.linear2(x) + return x + + def calibrate_loop(model): + """Simple calibration function for testing.""" + model(input_tensor) + + input_tensor = torch.randn(1, 10).cuda() + model = SimpleNetwork().eval().cuda() + + quant_cfg = mtq.INT8_DEFAULT_CFG + mtq.quantize(model, quant_cfg, forward_loop=calibrate_loop) + # model has INT8 qdq nodes at this point + output_pyt = model(input_tensor) + + with torch.no_grad(): + with export_torch_mode(): + exp_program = torch.export.export(model, (input_tensor,)) + trt_model = torchtrt.dynamo.compile( + exp_program, + inputs=[input_tensor], + enabled_precisions={torch.int8}, + min_block_size=1, + debug=True, + cache_built_engines=False, + reuse_cached_engines=False, + truncate_double=True, + ) + outputs_trt = trt_model(input_tensor) + assert torch.allclose(output_pyt, outputs_trt, rtol=5e-3, atol=1e-2) From 133d3e27e9805f2bde326dfd91d0229768fffd61 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Wed, 9 Apr 2025 15:07:42 -0700 Subject: [PATCH 06/13] chore: updates --- tests/py/dynamo/test_modelopt_models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/py/dynamo/test_modelopt_models.py b/tests/py/dynamo/test_modelopt_models.py index 2593d03e74..50ccd2880d 100644 --- a/tests/py/dynamo/test_modelopt_models.py +++ b/tests/py/dynamo/test_modelopt_models.py @@ -22,7 +22,7 @@ "ModelOpt is required to run this test", ) @pytest.mark.unit -def test_base_fp8(ir): +def test_base_fp8(): import modelopt.torch.quantization as mtq from modelopt.torch.quantization.utils import export_torch_mode @@ -73,7 +73,7 @@ def calibrate_loop(model): "modelopt 0.17.0 or later is required, Int8 quantization is supported in modelopt since 0.17.0 or later for linux", ) @pytest.mark.unit -def test_base_int8(ir): +def test_base_int8(): import modelopt.torch.quantization as mtq from modelopt.torch.quantization.utils import export_torch_mode From 1835fa3ee3a03b2dedbb15bdd2ab3fce74966ea9 Mon Sep 17 00:00:00 2001 From: 
Dheeraj Peri Date: Thu, 17 Apr 2025 16:54:00 -0700 Subject: [PATCH 07/13] chore: bug fix --- .../dynamo/conversion/converter_utils.py | 63 ++++++++++--------- 1 file changed, 33 insertions(+), 30 deletions(-) diff --git a/py/torch_tensorrt/dynamo/conversion/converter_utils.py b/py/torch_tensorrt/dynamo/conversion/converter_utils.py index bcb8495c67..375eed65a2 100644 --- a/py/torch_tensorrt/dynamo/conversion/converter_utils.py +++ b/py/torch_tensorrt/dynamo/conversion/converter_utils.py @@ -585,42 +585,45 @@ def to_numpy( Returns: A Numpy array or None, if the input was None. """ - output = None + with unset_fake_temporarily(): + output = None - if value is None or isinstance(value, np.ndarray): - output = value + if value is None or isinstance(value, np.ndarray): + output = value - elif isinstance(value, torch.Tensor): - if value.is_quantized: - value = value.dequantize() - elif value.dtype == torch.bfloat16: - # TODO: Remove when numpy has a BF16 type - _LOGGER.warning( - "Requested a conversion of bfloat16 tensor from torch to numpy which isn't supported. Casting this tensor to FP32 precision currently. Please use to_torch() API for better data representation", - ) - value = value.to(torch.float) - - output = value.cpu().detach().contiguous().numpy() + elif isinstance(value, torch.Tensor): + if value.is_quantized: + value = value.dequantize() + elif value.dtype == torch.bfloat16: + # TODO: Remove when numpy has a BF16 type + _LOGGER.warning( + "Requested a conversion of bfloat16 tensor from torch to numpy which isn't supported. Casting this tensor to FP32 precision currently. Please use to_torch() API for better data representation", + ) + value = value.to(torch.float) - elif isinstance(value, int): - output = np.array([value], dtype=np.int32) + output = value.cpu().detach().contiguous().numpy() - elif isinstance(value, float): - output = np.array([value], dtype=np.float32) + elif isinstance(value, int): + output = np.array([value], dtype=np.int32) - elif isinstance(value, bool): - output = np.array([value], dtype=np.bool_) + elif isinstance(value, float): + output = np.array([value], dtype=np.float32) - if isinstance(output, np.ndarray) or output is None: - return ( - output - if (dtype is None or output is None) - else output.astype(_enums.dtype._from(dtype).to(np.dtype, use_default=True)) - ) - else: - raise AssertionError( - f"to_numpy can only be called on None, bool, int, float, np.ndarray, or torch.Tensor, got: {value}" - ) + elif isinstance(value, bool): + output = np.array([value], dtype=np.bool_) + + if isinstance(output, np.ndarray) or output is None: + return ( + output + if (dtype is None or output is None) + else output.astype( + _enums.dtype._from(dtype).to(np.dtype, use_default=True) + ) + ) + else: + raise AssertionError( + f"to_numpy can only be called on None, bool, int, float, np.ndarray, or torch.Tensor, got: {value}" + ) def to_torch( From c57aa0491f8b80b144ad33fa158c98f590ef8398 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Tue, 22 Apr 2025 10:30:42 -0700 Subject: [PATCH 08/13] chore: bug fixes for refit tests, restructure CI tests --- .github/workflows/build-test-linux.yml | 9 +- py/torch_tensorrt/dynamo/_refit.py | 66 ++---- .../{ => models}/test_modelopt_models.py | 0 tests/py/dynamo/models/test_models_export.py | 199 ------------------ 4 files changed, 30 insertions(+), 244 deletions(-) rename tests/py/dynamo/{ => models}/test_modelopt_models.py (100%) delete mode 100644 tests/py/dynamo/models/test_models_export.py diff --git 
a/.github/workflows/build-test-linux.yml b/.github/workflows/build-test-linux.yml index 300dbd9d0d..9efb2ab146 100644 --- a/.github/workflows/build-test-linux.yml +++ b/.github/workflows/build-test-linux.yml @@ -172,8 +172,12 @@ jobs: cd tests/py python -m pip install -r requirements.txt cd dynamo - python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dyn_models_export.xml --ir dynamo models/ - python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dyn_models_export.xml test_modelopt_models.py + python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dynamo_models.xml --ir dynamo models/test_models.py + python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dynamo_models_dynamic.xml --ir dynamo models/test_dyn_models.py + python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/engine_cache.xml --ir dynamo models/test_engine_cache.py + python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/model_refit.xml --ir dynamo models/test_model_refit.py + python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/modelopt_models.xml --ir dynamo models/test_modelopt_models.py + python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/weight_stripped_engine.xml --ir dynamo models/test_weight_stripped_engine.py popd tests-py-dynamo-serde: @@ -206,6 +210,7 @@ jobs: cd dynamo python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/export_serde_test_results.xml --ir dynamo models/test_export_serde.py python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/reexport_test_results.xml --ir dynamo models/test_reexport.py + python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/export_kwargs_serde_test_results.xml --ir dynamo models/test_export_kwargs_serde.py popd tests-py-torch-compile-be: diff --git a/py/torch_tensorrt/dynamo/_refit.py b/py/torch_tensorrt/dynamo/_refit.py index 1082222aab..f1215730a9 100644 --- a/py/torch_tensorrt/dynamo/_refit.py +++ b/py/torch_tensorrt/dynamo/_refit.py @@ -48,7 +48,7 @@ logger = logging.getLogger(__name__) -@needs_refit +@needs_refit # type: ignore def construct_refit_mapping( module: torch.fx.GraphModule, inputs: Sequence[Input], @@ -110,7 +110,7 @@ def construct_refit_mapping( return weight_map -@needs_refit +@needs_refit # type: ignore def construct_refit_mapping_from_weight_name_map( weight_name_map: dict[Any, Any], state_dict: dict[Any, Any], @@ -141,7 +141,7 @@ def construct_refit_mapping_from_weight_name_map( return engine_weight_map -@needs_refit +@needs_refit # type: ignore def _refit_single_trt_engine_with_gm( new_gm: torch.fx.GraphModule, old_engine: trt.ICudaEngine, @@ -153,12 +153,12 @@ def _refit_single_trt_engine_with_gm( Refit a TensorRT Engine in place """ - with unset_fake_temporarily(): - refitted = set() - torch_device = get_model_device(new_gm) - refitter = trt.Refitter(old_engine, TRT_LOGGER) - weight_list = refitter.get_all_weights() + refitted = set() + torch_device = get_model_device(new_gm) + refitter = trt.Refitter(old_engine, TRT_LOGGER) + weight_list = refitter.get_all_weights() + with unset_fake_temporarily(): if weight_name_map: # Get the refitting mapping trt_wt_location = ( @@ -185,41 +185,21 @@ def _refit_single_trt_engine_with_gm( trt_dtype, ) - constant_mapping: dict[str, Any] = weight_name_map.pop( - "constant_mapping", {} - ) # type: ignore - mapping = construct_refit_mapping_from_weight_name_map( - weight_name_map, new_gm.state_dict() - ) - constant_mapping_with_type = {} - - for constant_name, val in constant_mapping.items(): - np_weight_type = val.dtype - val_tensor = torch.from_numpy(val).cuda() - 
trt_dtype = dtype.try_from(np_weight_type).to(trt.DataType) - torch_dtype = dtype.try_from(np_weight_type).to(torch.dtype) - constant_mapping_with_type[constant_name] = ( - val_tensor.clone().reshape(-1).contiguous().to(torch_dtype), - trt_dtype, - ) + mapping.update(constant_mapping_with_type) - mapping.update(constant_mapping_with_type) - - for layer_name in weight_list: - if layer_name not in mapping: - logger.warning(f"{layer_name} is not found in weight mapping.") - continue - # Use Numpy to create weights - weight, weight_dtype = mapping[layer_name] - trt_wt_tensor = trt.Weights( - weight_dtype, weight.data_ptr(), torch.numel(weight) - ) - refitter.set_named_weights( - layer_name, trt_wt_tensor, trt_wt_location - ) - assert ( - len(refitter.get_missing_weights()) == 0 - ), "Fast refitting failed due to incomplete mapping" + for layer_name in weight_list: + if layer_name not in mapping: + logger.warning(f"{layer_name} is not found in weight mapping.") + continue + # Use Numpy to create weights + weight, weight_dtype = mapping[layer_name] + trt_wt_tensor = trt.Weights( + weight_dtype, weight.data_ptr(), torch.numel(weight) + ) + refitter.set_named_weights(layer_name, trt_wt_tensor, trt_wt_location) + assert ( + len(refitter.get_missing_weights()) == 0 + ), "Fast refitting failed due to incomplete mapping" else: mapping = construct_refit_mapping(new_gm, input_list, settings) @@ -241,7 +221,7 @@ def _refit_single_trt_engine_with_gm( raise AssertionError("Refitting failed.") -@needs_refit +@needs_refit # type: ignore def refit_module_weights( compiled_module: torch.fx.GraphModule | ExportedProgram, new_weight_module: ExportedProgram, diff --git a/tests/py/dynamo/test_modelopt_models.py b/tests/py/dynamo/models/test_modelopt_models.py similarity index 100% rename from tests/py/dynamo/test_modelopt_models.py rename to tests/py/dynamo/models/test_modelopt_models.py diff --git a/tests/py/dynamo/models/test_models_export.py b/tests/py/dynamo/models/test_models_export.py deleted file mode 100644 index 005594d62e..0000000000 --- a/tests/py/dynamo/models/test_models_export.py +++ /dev/null @@ -1,199 +0,0 @@ -# type: ignore -import importlib -import platform -import unittest -from importlib import metadata - -import pytest -import timm -import torch -import torch_tensorrt as torchtrt -import torchvision.models as models -from torch_tensorrt.dynamo.utils import COSINE_THRESHOLD, cosine_similarity - -from packaging.version import Version - -assertions = unittest.TestCase() - - -@pytest.mark.unit -def test_resnet18(ir): - model = models.resnet18(pretrained=True).eval().to("cuda") - input = torch.randn((1, 3, 224, 224)).to("cuda") - - compile_spec = { - "inputs": [ - torchtrt.Input( - input.shape, dtype=torch.float, format=torch.contiguous_format - ) - ], - "device": torchtrt.Device("cuda:0"), - "enabled_precisions": {torch.float}, - "ir": ir, - "pass_through_build_failures": True, - "optimization_level": 1, - "min_block_size": 8, - "cache_built_engines": False, - "reuse_cached_engines": False, - } - - trt_mod = torchtrt.compile(model, **compile_spec) - cos_sim = cosine_similarity(model(input), trt_mod(input)[0]) - assertions.assertTrue( - cos_sim > COSINE_THRESHOLD, - msg=f"Resnet18 TRT outputs don't match with the original model. 
Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", - ) - - # Clean up model env - torch._dynamo.reset() - - -@pytest.mark.unit -def test_mobilenet_v2(ir): - model = models.mobilenet_v2(pretrained=True).eval().to("cuda") - input = torch.randn((1, 3, 224, 224)).to("cuda") - - compile_spec = { - "inputs": [ - torchtrt.Input( - input.shape, dtype=torch.float, format=torch.contiguous_format - ) - ], - "device": torchtrt.Device("cuda:0"), - "enabled_precisions": {torch.float}, - "ir": ir, - "pass_through_build_failures": True, - "optimization_level": 1, - "min_block_size": 8, - "cache_built_engines": False, - "reuse_cached_engines": False, - } - - trt_mod = torchtrt.compile(model, **compile_spec) - cos_sim = cosine_similarity(model(input), trt_mod(input)[0]) - assertions.assertTrue( - cos_sim > COSINE_THRESHOLD, - msg=f"Mobilenet v2 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", - ) - - # Clean up model env - torch._dynamo.reset() - - -@pytest.mark.unit -def test_efficientnet_b0(ir): - model = timm.create_model("efficientnet_b0", pretrained=True).eval().to("cuda") - input = torch.randn((1, 3, 224, 224)).to("cuda") - - compile_spec = { - "inputs": [ - torchtrt.Input( - input.shape, dtype=torch.float, format=torch.contiguous_format - ) - ], - "device": torchtrt.Device("cuda:0"), - "enabled_precisions": {torch.float}, - "ir": ir, - "pass_through_build_failures": True, - "optimization_level": 1, - "min_block_size": 8, - "cache_built_engines": False, - "reuse_cached_engines": False, - } - - trt_mod = torchtrt.compile(model, **compile_spec) - cos_sim = cosine_similarity(model(input), trt_mod(input)[0]) - assertions.assertTrue( - cos_sim > COSINE_THRESHOLD, - msg=f"EfficientNet-B0 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", - ) - - # Clean up model env - torch._dynamo.reset() - - -@pytest.mark.unit -@unittest.skipIf( - not importlib.util.find_spec("transformers"), - "transformers is required to run this test", -) -def test_bert_base_uncased(ir): - from transformers import BertModel - - model = ( - BertModel.from_pretrained("bert-base-uncased", return_dict=False).cuda().eval() - ) - input = torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda") - input2 = torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda") - - compile_spec = { - "inputs": [ - torchtrt.Input( - input.shape, - dtype=input.dtype, - format=torch.contiguous_format, - ), - torchtrt.Input( - input.shape, - dtype=input.dtype, - format=torch.contiguous_format, - ), - ], - "device": torchtrt.Device("cuda:0"), - "enabled_precisions": {torch.float}, - "truncate_double": True, - "ir": ir, - "min_block_size": 10, - "cache_built_engines": False, - "reuse_cached_engines": False, - } - trt_mod = torchtrt.compile(model, **compile_spec) - model_outputs = model(input, input2) - trt_model_outputs = trt_mod(input, input2) - assertions.assertTrue( - len(model_outputs) == len(trt_model_outputs), - msg=f"Number of outputs for BERT model compilation is different with Pytorch {len(model_outputs)} and TensorRT {len(trt_model_outputs)}. Please check the compilation.", - ) - - for index in range(len(model_outputs)): - out, trt_out = model_outputs[index], trt_model_outputs[index] - cos_sim = cosine_similarity(out, trt_out) - assertions.assertTrue( - cos_sim > COSINE_THRESHOLD, - msg=f"HF BERT base-uncased TRT outputs don't match with the original model. 
Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", - ) - - # Clean up model env - torch._dynamo.reset() - - -@pytest.mark.unit -def test_resnet18_half(ir): - model = models.resnet18(pretrained=True).eval().to("cuda").half() - input = torch.randn((1, 3, 224, 224)).to("cuda").half() - - compile_spec = { - "inputs": [ - torchtrt.Input( - input.shape, dtype=torch.half, format=torch.contiguous_format - ) - ], - "device": torchtrt.Device("cuda:0"), - "enabled_precisions": {torch.half}, - "ir": ir, - "pass_through_build_failures": True, - "optimization_level": 1, - "min_block_size": 8, - "cache_built_engines": False, - "reuse_cached_engines": False, - } - - trt_mod = torchtrt.compile(model, **compile_spec) - cos_sim = cosine_similarity(model(input), trt_mod(input)[0]) - assertions.assertTrue( - cos_sim > COSINE_THRESHOLD, - msg=f"Resnet18 Half TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}", - ) - - # Clean up model env - torch._dynamo.reset() From 574b6bd820158b7a6b7bce053b0df9f07a02135a Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Tue, 22 Apr 2025 10:36:48 -0700 Subject: [PATCH 09/13] chore: remove debug logging in CI tests, add missing dtype_support test --- .github/workflows/build-test-linux.yml | 1 + tests/py/dynamo/backend/test_backend_compiler.py | 8 +------- tests/py/dynamo/conversion/harness.py | 2 -- tests/py/dynamo/models/test_dtype_support.py | 1 - tests/py/dynamo/models/test_model_refit.py | 1 - tests/py/dynamo/models/test_modelopt_models.py | 2 -- tests/py/dynamo/runtime/test_002_cudagraphs_py.py | 1 - 7 files changed, 2 insertions(+), 14 deletions(-) diff --git a/.github/workflows/build-test-linux.yml b/.github/workflows/build-test-linux.yml index 9efb2ab146..e740fb1e49 100644 --- a/.github/workflows/build-test-linux.yml +++ b/.github/workflows/build-test-linux.yml @@ -175,6 +175,7 @@ jobs: python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dynamo_models.xml --ir dynamo models/test_models.py python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dynamo_models_dynamic.xml --ir dynamo models/test_dyn_models.py python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/engine_cache.xml --ir dynamo models/test_engine_cache.py + python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dtype_support.xml --ir dynamo models/test_dtype_support.py python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/model_refit.xml --ir dynamo models/test_model_refit.py python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/modelopt_models.xml --ir dynamo models/test_modelopt_models.py python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/weight_stripped_engine.xml --ir dynamo models/test_weight_stripped_engine.py diff --git a/tests/py/dynamo/backend/test_backend_compiler.py b/tests/py/dynamo/backend/test_backend_compiler.py index 4c65800f05..6369d3805c 100644 --- a/tests/py/dynamo/backend/test_backend_compiler.py +++ b/tests/py/dynamo/backend/test_backend_compiler.py @@ -2,11 +2,10 @@ from copy import deepcopy import torch +import torch_tensorrt from torch.testing._internal.common_utils import TestCase, run_tests from torch_tensorrt.dynamo.partitioning import fast_partition -import torch_tensorrt - from ..testing_utilities import DECIMALS_OF_AGREEMENT, lower_graph_testing @@ -51,7 +50,6 @@ def forward(self, x, y): pass_through_build_failures=True, torch_executed_ops={"torch.ops.aten.add.Tensor"}, use_python_runtime=False, - debug=True, ) optimized_model_results = optimized_model(*inputs).detach().cpu() 
torch_model_results = fx_graph(*inputs).detach().cpu() @@ -132,7 +130,6 @@ def forward(self, x, y): pass_through_build_failures=True, torch_executed_ops={"torch.ops.aten.add.Tensor"}, use_python_runtime=False, - debug=True, ) optimized_model_results = optimized_model(*inputs).detach().cpu() torch_model_results = model(*inputs).detach().cpu() @@ -177,7 +174,6 @@ def forward(self, x, y): optimization_level=4, version_compatible=True, max_aux_streams=5, - debug=True, ) optimized_model_results = optimized_model(*inputs).detach().cpu() torch_model_results = fx_graph(*inputs).detach().cpu() @@ -225,7 +221,6 @@ def forward(self, x, y): min_block_size=1, pass_through_build_failures=True, truncate_double=True, - debug=True, ) optimized_model_results = optimized_model(*inputs).detach().cpu() torch_model_results = fx_graph(*inputs).detach().cpu() @@ -298,7 +293,6 @@ def forward(self, x, y): min_block_size=1, pass_through_build_failures=True, truncate_double=False, - debug=True, torch_executed_ops={"torch.ops.aten.add.Tensor"}, ) optimized_model_results = optimized_model(*inputs).detach().cpu() diff --git a/tests/py/dynamo/conversion/harness.py b/tests/py/dynamo/conversion/harness.py index 6ff45507a0..aa22a74fc0 100644 --- a/tests/py/dynamo/conversion/harness.py +++ b/tests/py/dynamo/conversion/harness.py @@ -415,7 +415,6 @@ def run_test( compilation_settings = CompilationSettings( enabled_precisions={dtype._from(precision)}, truncate_double=True, - debug=True, immutable_weights=immutable_weights, ) @@ -507,7 +506,6 @@ def run_test_compare_tensor_attributes_only( compilation_settings = CompilationSettings( enabled_precisions={dtype._from(precision)}, truncate_double=True, - debug=True, immutable_weights=immutable_weights, ) diff --git a/tests/py/dynamo/models/test_dtype_support.py b/tests/py/dynamo/models/test_dtype_support.py index 146f7fdb7d..37b40574a1 100644 --- a/tests/py/dynamo/models/test_dtype_support.py +++ b/tests/py/dynamo/models/test_dtype_support.py @@ -297,7 +297,6 @@ def forward(self, x): ir="torch_compile", inputs=inputs, enabled_precisions={torch.bfloat16}, - debug=True, min_block_size=1, device=device, cache_built_engines=False, diff --git a/tests/py/dynamo/models/test_model_refit.py b/tests/py/dynamo/models/test_model_refit.py index d71091b04e..b170bcc47d 100644 --- a/tests/py/dynamo/models/test_model_refit.py +++ b/tests/py/dynamo/models/test_model_refit.py @@ -815,7 +815,6 @@ def forward(self, x): exp_program, tuple(inputs), enabled_precisions={torch.float}, - debug=True, min_block_size=1, immutable_weights=False, ) diff --git a/tests/py/dynamo/models/test_modelopt_models.py b/tests/py/dynamo/models/test_modelopt_models.py index 50ccd2880d..51a74b022a 100644 --- a/tests/py/dynamo/models/test_modelopt_models.py +++ b/tests/py/dynamo/models/test_modelopt_models.py @@ -58,7 +58,6 @@ def calibrate_loop(model): inputs=[input_tensor], enabled_precisions={torch.float8_e4m3fn}, min_block_size=1, - debug=True, cache_built_engines=False, reuse_cached_engines=False, ) @@ -109,7 +108,6 @@ def calibrate_loop(model): inputs=[input_tensor], enabled_precisions={torch.int8}, min_block_size=1, - debug=True, cache_built_engines=False, reuse_cached_engines=False, truncate_double=True, diff --git a/tests/py/dynamo/runtime/test_002_cudagraphs_py.py b/tests/py/dynamo/runtime/test_002_cudagraphs_py.py index 0a4629644d..0c9b8bc13f 100644 --- a/tests/py/dynamo/runtime/test_002_cudagraphs_py.py +++ b/tests/py/dynamo/runtime/test_002_cudagraphs_py.py @@ -61,7 +61,6 @@ def forward(self, x): min_block_size=1, 
             pass_through_build_failures=True,
             use_python_runtime=True,
-            debug=True,
         )
 
         result_samples = []

From 9f32b3d7f1cf7929f7e87a7bfc693c4c125bc003 Mon Sep 17 00:00:00 2001
From: Dheeraj Peri
Date: Tue, 22 Apr 2025 11:57:16 -0700
Subject: [PATCH 10/13] chore: updates

---
 py/torch_tensorrt/dynamo/conversion/impl/quantize.py | 2 ++
 tests/py/dynamo/models/test_modelopt_models.py       | 3 ++-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/py/torch_tensorrt/dynamo/conversion/impl/quantize.py b/py/torch_tensorrt/dynamo/conversion/impl/quantize.py
index e472ed3092..c23fd55c5a 100644
--- a/py/torch_tensorrt/dynamo/conversion/impl/quantize.py
+++ b/py/torch_tensorrt/dynamo/conversion/impl/quantize.py
@@ -45,7 +45,9 @@ def quantize(
             max_bound = 448
 
         amax = to_torch(amax, None)
+        print(f"=========== AMAX: {amax}")
         scale = torch.divide(amax, max_bound)
+        print(f"=========== SCALE: {scale}")
         scale = get_trt_tensor(ctx, scale, name + "_scale")
         # Add Q node
         quantize_layer = ctx.net.add_quantize(input_tensor, scale)
diff --git a/tests/py/dynamo/models/test_modelopt_models.py b/tests/py/dynamo/models/test_modelopt_models.py
index 51a74b022a..36dce5af32 100644
--- a/tests/py/dynamo/models/test_modelopt_models.py
+++ b/tests/py/dynamo/models/test_modelopt_models.py
@@ -100,7 +100,7 @@ def calibrate_loop(model):
     # model has INT8 qdq nodes at this point
     output_pyt = model(input_tensor)
 
-    with torch.no_grad():
+    with torchtrt.logging.debug(), torch.no_grad():
         with export_torch_mode():
             exp_program = torch.export.export(model, (input_tensor,))
             trt_model = torchtrt.dynamo.compile(
@@ -111,6 +111,7 @@ def calibrate_loop(model):
                 cache_built_engines=False,
                 reuse_cached_engines=False,
                 truncate_double=True,
+                debug=True,
             )
             outputs_trt = trt_model(input_tensor)
             assert torch.allclose(output_pyt, outputs_trt, rtol=5e-3, atol=1e-2)

From 8c97e337b8e4fdbf528aa003f57bfb6b85c341dc Mon Sep 17 00:00:00 2001
From: Dheeraj Peri
Date: Tue, 22 Apr 2025 14:42:44 -0700
Subject: [PATCH 11/13] chore: bug fixes

---
 py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py | 2 +-
 py/torch_tensorrt/dynamo/conversion/impl/quantize.py   | 2 --
 tests/py/dynamo/models/test_modelopt_models.py         | 4 ++--
 tests/py/requirements.txt                              | 2 +-
 4 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py
index fde07bf1f5..5bd3efd35c 100644
--- a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py
+++ b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py
@@ -898,7 +898,7 @@ def get_attr(self, target: str, args: Any, kwargs: Any) -> np.ndarray:
         else:
             constant_tensor = frozen_attr
 
-            return to_torch(constant_tensor)
+        return to_torch(constant_tensor)
 
     def call_method(self, target: str, args: Any, kwargs: Any) -> Any:
         assert isinstance(target, str)
diff --git a/py/torch_tensorrt/dynamo/conversion/impl/quantize.py b/py/torch_tensorrt/dynamo/conversion/impl/quantize.py
index c23fd55c5a..e472ed3092 100644
--- a/py/torch_tensorrt/dynamo/conversion/impl/quantize.py
+++ b/py/torch_tensorrt/dynamo/conversion/impl/quantize.py
@@ -45,9 +45,7 @@ def quantize(
             max_bound = 448
 
         amax = to_torch(amax, None)
-        print(f"=========== AMAX: {amax}")
         scale = torch.divide(amax, max_bound)
-        print(f"=========== SCALE: {scale}")
         scale = get_trt_tensor(ctx, scale, name + "_scale")
         # Add Q node
         quantize_layer = ctx.net.add_quantize(input_tensor, scale)
diff --git a/tests/py/dynamo/models/test_modelopt_models.py b/tests/py/dynamo/models/test_modelopt_models.py
index 36dce5af32..c2cd719bf9 100644
--- a/tests/py/dynamo/models/test_modelopt_models.py
+++ b/tests/py/dynamo/models/test_modelopt_models.py
@@ -68,7 +68,7 @@ def calibrate_loop(model):
 @unittest.skipIf(
     platform.system() != "Linux"
     or not importlib.util.find_spec("modelopt")
-    or Version(metadata.version("nvidia-modelopt")) < Version("0.17.0"),
+    or Version(metadata.version("nvidia-modelopt")) < Version("0.27.0"),
     "modelopt 0.17.0 or later is required, Int8 quantization is supported in modelopt since 0.17.0 or later for linux",
 )
 @pytest.mark.unit
@@ -102,7 +102,7 @@ def calibrate_loop(model):
 
     with torchtrt.logging.debug(), torch.no_grad():
         with export_torch_mode():
-            exp_program = torch.export.export(model, (input_tensor,))
+            exp_program = torch.export.export(model, (input_tensor,), strict=False)
             trt_model = torchtrt.dynamo.compile(
                 exp_program,
                 inputs=[input_tensor],
diff --git a/tests/py/requirements.txt b/tests/py/requirements.txt
index 4f3c4e083b..1925ef5839 100644
--- a/tests/py/requirements.txt
+++ b/tests/py/requirements.txt
@@ -9,5 +9,5 @@ pytest-xdist>=3.6.1
 pyyaml
 timm>=1.0.3
 transformers==4.49.0
-nvidia-modelopt[deploy,hf,torch]~=0.17.0; python_version < "3.13"
+nvidia-modelopt[deploy,hf,torch]~=0.27.0; python_version < "3.13"
 --extra-index-url https://pypi.nvidia.com

From d27759b9946e935e89d26fa1cb99ebc3533950de Mon Sep 17 00:00:00 2001
From: Dheeraj Peri
Date: Tue, 29 Apr 2025 12:17:30 -0700
Subject: [PATCH 12/13] chore: disable flashinfer rmsnorm test

---
 .../test_flashinfer_rmsnorm.py | 34 ++++++++++---------
 1 file changed, 18 insertions(+), 16 deletions(-)

diff --git a/tests/py/dynamo/automatic_plugin/test_flashinfer_rmsnorm.py b/tests/py/dynamo/automatic_plugin/test_flashinfer_rmsnorm.py
index cf803c5ffa..fd5ed390ff 100644
--- a/tests/py/dynamo/automatic_plugin/test_flashinfer_rmsnorm.py
+++ b/tests/py/dynamo/automatic_plugin/test_flashinfer_rmsnorm.py
@@ -1,8 +1,6 @@
-import pytest
-
-flashinfer = pytest.importorskip("flashinfer")
 import unittest
 
+import pytest
 import torch
 import torch.nn as nn
 import torch_tensorrt
@@ -12,25 +10,29 @@ from ..conversion.harness import DispatchTestCase
 
 
+# Toggle this flag to enable/disable flashinfer-based overrides
+enable_flashinfer: bool = False
+if enable_flashinfer:
+    import flashinfer
 
-@torch.library.custom_op("flashinfer::rmsnorm", mutates_args=())  # type: ignore[misc]
-def flashinfer_rmsnorm(
-    input: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6
-) -> torch.Tensor:
-    return flashinfer.norm.rmsnorm(input, weight)
+    @torch.library.custom_op("flashinfer::rmsnorm", mutates_args=())  # type: ignore[misc]
+    def flashinfer_rmsnorm(
+        input: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6
+    ) -> torch.Tensor:
+        return flashinfer.norm.rmsnorm(input, weight)
 
+    @torch.library.register_fake("flashinfer::rmsnorm")
+    def _(input: torch.Tensor, weight: torch.Tensor, b: float = 1e-6) -> torch.Tensor:
+        return input
 
-@torch.library.register_fake("flashinfer::rmsnorm")
-def _(input: torch.Tensor, weight: torch.Tensor, b: float = 1e-6) -> torch.Tensor:
-    return input
+    torch_tensorrt.dynamo.conversion.plugins.custom_op(
+        "flashinfer::rmsnorm", supports_dynamic_shapes=True
+    )
 
 
-torch_tensorrt.dynamo.conversion.plugins.custom_op(
-    "flashinfer::rmsnorm", supports_dynamic_shapes=True
+@unittest.skip(
+    "Flashinfer RMSNorm test is disabled due to error: SM75 support not available"
 )
-
-
-@unittest.skip("Not Available")
 class TestAutomaticPlugin(DispatchTestCase):
     @parameterized.expand(
         [

From 35defa84a94e3ae3fef593f8fb9c1c2089321bdb Mon Sep 17 00:00:00 2001
From: Dheeraj Peri
Date: Tue, 29 Apr 2025 13:49:29 -0700
Subject: [PATCH 13/13] chore: fix modelopt onnx dep

---
 tests/py/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/py/requirements.txt b/tests/py/requirements.txt
index 1925ef5839..94db519d28 100644
--- a/tests/py/requirements.txt
+++ b/tests/py/requirements.txt
@@ -9,5 +9,5 @@ pytest-xdist>=3.6.1
 pyyaml
 timm>=1.0.3
 transformers==4.49.0
-nvidia-modelopt[deploy,hf,torch]~=0.27.0; python_version < "3.13"
+nvidia-modelopt[all]~=0.27.0; python_version < "3.13"
 --extra-index-url https://pypi.nvidia.com