examples/dynamo/aot_plugin.py (2 changes: 1 addition & 1 deletion)
@@ -153,7 +153,7 @@ def forward(self, X: torch.Tensor) -> torch.Tensor:
     )
     args = parser.parse_args()
 
-    my_model = MyModel().to("cuda")
+    my_model = MyModel().to("cuda").eval()
     m = torch.full((64, 64), 2, device="cuda", dtype=torch.float)
 
     assert my_model(X=m)[0][0] == 3.0
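Every hunk in this diff applies the same convention: move the module to its device first, cast the dtype second, and call `eval()` last. A minimal sketch of the pattern, using an illustrative module and shapes that are not taken from any of the examples:

```python
import torch

# Any nn.Module works the same way; Linear is just a stand-in.
model = torch.nn.Linear(64, 64)

# Device placement first, precision cast second, inference mode last.
# Each call returns the module itself, so the chain ends with a CUDA,
# half-precision module whose dropout/batch-norm layers are frozen
# before it reaches torch.export or torch_tensorrt.compile.
model = model.to("cuda").half().eval()

x = torch.randn(8, 64, device="cuda", dtype=torch.half)
with torch.no_grad():
    y = model(x)
```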
examples/dynamo/auto_generate_converters.py (2 changes: 1 addition & 1 deletion)
@@ -169,7 +169,7 @@ def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
         return res
 
 
-my_model = MyModel().to("cuda")
+my_model = MyModel().to("cuda").eval()
 m = torch.full((64, 64), 2, device="cuda", dtype=torch.float)
 n = torch.full((64, 64), 3, device="cuda", dtype=torch.float)
 
examples/dynamo/auto_generate_plugins.py (2 changes: 1 addition & 1 deletion)
@@ -139,7 +139,7 @@ def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
         return res
 
 
-my_model = MyModel().to("cuda")
+my_model = MyModel().to("cuda").eval()
 m = torch.randint(0, 5, (64, 64), device="cuda", dtype=torch.float)
 n = torch.randint(0, 5, (64, 64), device="cuda", dtype=torch.float)
 
examples/dynamo/converter_overloading.py (4 changes: 2 additions & 2 deletions)
@@ -34,7 +34,7 @@ def forward(self, x):
         return torch.nn.functional.gelu(x, approximate=self.mode)
 
 
-my_mod = GeLU(mode="tanh")
+my_mod = GeLU(mode="tanh").to("cuda").eval()
 ex_input = torch.randn(2, 5).to("cuda")
 
 
@@ -198,7 +198,7 @@ def get_op_count():
 #
 # Finally, we want to verify that in the case that the ``approximate`` argument is not set to ``tanh``, our custom converter is not used.
 
-my_mod_erf = GeLU(mode="none")
+my_mod_erf = GeLU(mode="none").to("cuda").eval()
 my_gelu_erf = torch_tensorrt.compile(
     my_mod_erf, arg_inputs=(ex_input,), min_block_size=1
 )
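The comment in the second hunk says the custom converter should be bypassed when ``approximate`` is not ``tanh``. One hedged way to sanity-check that (a plain output-equivalence assertion of my own, not the tutorial's ``get_op_count`` bookkeeping):

```python
# Sketch: with mode="none", compilation should fall back to the standard
# GELU converter, so the compiled outputs still match eager PyTorch.
with torch.no_grad():
    assert torch.allclose(
        my_gelu_erf(ex_input), my_mod_erf(ex_input), rtol=1e-3, atol=1e-3
    )
```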
examples/dynamo/cross_runtime_compilation_for_windows.py (2 changes: 1 addition & 1 deletion)
@@ -46,7 +46,7 @@
 
 args = PARSER.parse_args()
 torch.manual_seed(0)
-model = models.resnet18().eval().cuda()
+model = models.resnet18().cuda().eval()
 input = torch.rand((1, 3, 224, 224)).to("cuda")
 inputs = [input]
 
examples/dynamo/custom_kernel_plugins.py (2 changes: 1 addition & 1 deletion)
@@ -217,7 +217,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         return y
 
 
-my_model = MyModel((1, 1, 2, 0)).to("cuda")
+my_model = MyModel((1, 1, 2, 0)).to("cuda").eval()
 my_model(ex_input)
 
 ##############################################################################
examples/dynamo/engine_caching_example.py (2 changes: 1 addition & 1 deletion)
@@ -37,7 +37,7 @@
 np.random.seed(0)
 torch.manual_seed(0)
 
-model = models.resnet18(pretrained=True).eval().to("cuda")
+model = models.resnet18(pretrained=True).to("cuda").eval()
 enabled_precisions = {torch.float}
 min_block_size = 1
 use_python_runtime = False
examples/dynamo/llama2_flashinfer_rmsnorm.py (2 changes: 1 addition & 1 deletion)
@@ -220,7 +220,7 @@ def replace_rmsnorm(
 
 # 2. Initialize model (random weights)
 with torch.no_grad():
-    model = LlamaForCausalLM(config).eval().half()
+    model = LlamaForCausalLM(config).cuda().half().eval()
 
 # 3. Export with static shapes
 input_ids = torch.randint(0, 32000, (1, 64))  # Static [batch=1, seq=64]
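Step 3's static-shape export, condensed into a hedged sketch. The `torch.export` call mirrors the tutorial's numbered flow; placing `input_ids` on CUDA is my assumption, made to match the model's new `.cuda()` placement:

```python
import torch

# Static [batch=1, seq=64] token IDs on the same device as the model.
input_ids = torch.randint(0, 32000, (1, 64), device="cuda")

with torch.no_grad():
    exported_program = torch.export.export(model, (input_ids,))
```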
examples/dynamo/mutable_torchtrt_module_example.py (8 changes: 4 additions & 4 deletions)
@@ -37,7 +37,7 @@
     "immutable_weights": False,
 }
 
-model = models.resnet18(pretrained=True).eval().to("cuda")
+model = models.resnet18(pretrained=True).to("cuda").eval()
 mutable_module = torch_trt.MutableTorchTensorRTModule(model, **settings)
 # You can use the mutable module just like the original pytorch module. The compilation happens while you first call the mutable module.
 mutable_module(*inputs)
@@ -47,7 +47,7 @@
 
 # %%
 # Making changes to mutable module can trigger refit or re-compilation. For example, loading a different state_dict and setting new weight values will trigger refit, and adding a module to the model will trigger re-compilation.
-model2 = models.resnet18(pretrained=False).eval().to("cuda")
+model2 = models.resnet18(pretrained=False).to("cuda").eval()
 mutable_module.load_state_dict(model2.state_dict())
 
 
@@ -163,7 +163,7 @@ def forward(self, a, b, c={}):
 
 
 device = "cuda:0"
-model = Model().eval().to(device)
+model = Model().to(device).eval()
 inputs = (torch.rand(10, 3).to(device), torch.rand(3, 30).to(device))
 kwargs = {
     "c": {"a": torch.rand(10, 30).to(device), "b": torch.rand(10, 30).to(device)},
@@ -199,7 +199,7 @@ def forward(self, a, b, c={}):
 
 from torch_tensorrt.dynamo._defaults import TIMING_CACHE_PATH
 
-model = models.resnet18(pretrained=True).eval().to("cuda")
+model = models.resnet18(pretrained=True).to("cuda").eval()
 
 times = []
 start = torch.cuda.Event(enable_timing=True)
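The comment in the second hunk distinguishes refit from re-compilation. A condensed, hedged sketch of that life cycle, reusing the example's own names (the exact timing of the refit is my reading of the mutable-module tutorial):

```python
# Sketch of the mutable-module life cycle described in the comments above.
mutable_module(*inputs)                              # first call: full compilation
mutable_module.load_state_dict(model2.state_dict())  # same graph, new weights
mutable_module(*inputs)                              # next call: engine refit, not recompile
```

Structural changes, such as attaching a new submodule, alter the traced graph itself, which is why the comment says they fall back to a full re-compilation rather than a refit.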
examples/dynamo/pre_allocated_output_example.py (4 changes: 2 additions & 2 deletions)
@@ -67,9 +67,9 @@ def test_module_perf(model, *input):
 # Load bert model
 model = (
     BertModel.from_pretrained("bert-base-uncased", torchscript=True)
-    .eval()
-    .half()
     .to("cuda")
+    .half()
+    .eval()
 )
 # Define sample inputs
 inputs = [
examples/dynamo/refit_engine_example.py (5 changes: 2 additions & 3 deletions)
@@ -53,7 +53,7 @@
 #
 # In this case we are going to compile a ResNet18 model with randomly initialized weights and save it.
 
-model = models.resnet18(pretrained=False).eval().to("cuda")
+model = models.resnet18(pretrained=False).to("cuda").eval()
 exp_program = torch.export.export(model, tuple(inputs))
 enabled_precisions = {torch.float}
 workspace_size = 20 << 30
@@ -85,7 +85,7 @@
 # function is used to update the weights of the compiled module with the new weights.
 
 # Create and compile the updated model
-model2 = models.resnet18(pretrained=True).eval().to("cuda")
+model2 = models.resnet18(pretrained=True).to("cuda").eval()
 exp_program2 = torch.export.export(model2, tuple(inputs))
 
 
@@ -99,7 +99,6 @@
 )
 
 # Check the output
-model2.to("cuda")
 expected_outputs, refitted_outputs = exp_program2.module()(*inputs), new_trt_gm(*inputs)
 for expected_output, refitted_output in zip(expected_outputs, refitted_outputs):
     assert torch.allclose(
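The refit call itself is collapsed out of the diff between the second and third hunks. As a hedged sketch, the dynamo refit API looks roughly like this; the keyword names follow the refit tutorial as I recall it, and `trt_gm` is my assumed name for the previously compiled module:

```python
from torch_tensorrt.dynamo import refit_module_weights

# Swap the weights from exp_program2 into the already-compiled module,
# producing the new_trt_gm that the output check above compares against.
new_trt_gm = refit_module_weights(
    compiled_module=trt_gm,
    new_weight_module=exp_program2,
    arg_inputs=inputs,
)
```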
examples/dynamo/torch_compile_advanced_usage.py (4 changes: 2 additions & 2 deletions)
@@ -36,7 +36,7 @@ def forward(self, x: torch.Tensor, y: torch.Tensor):
 
 # Define sample float inputs and initialize model
 sample_inputs = [torch.rand((5, 7)).cuda(), torch.rand((5, 7)).cuda()]
-model = Model().eval().cuda()
+model = Model().cuda().eval()
 
 # %%
 
@@ -60,7 +60,7 @@ def forward(self, x: torch.Tensor, y: torch.Tensor):
     torch.rand((5, 7)).half().cuda(),
     torch.rand((5, 7)).half().cuda(),
 ]
-model_half = Model().eval().cuda()
+model_half = Model().cuda().eval()
 
 # %%
 
examples/dynamo/torch_compile_gpt2.py (4 changes: 2 additions & 2 deletions)
@@ -44,8 +44,8 @@
         use_cache=False,
         attn_implementation="eager",
     )
+    .to(DEVICE)
     .eval()
-    .cuda()
 )
 
 # %%
@@ -54,7 +54,7 @@
 # Tokenize a sample input prompt and get pytorch model outputs
 prompt = "I enjoy walking with my cute dog"
 model_inputs = tokenizer(prompt, return_tensors="pt")
-input_ids = model_inputs["input_ids"].cuda()
+input_ids = model_inputs["input_ids"].to(DEVICE)
 
 # %%
 # The ``generate()`` API of the ``AutoModelForCausalLM`` class is used for auto-regressive generation with greedy decoding.
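The greedy-decoding call the last comment refers to, as a hedged sketch; the `max_length` budget and the `pad_token_id` handling here are my assumptions, not the tutorial's exact settings:

```python
# Auto-regressive greedy generation with the eager PyTorch model.
pyt_gen_tokens = model.generate(
    input_ids,
    max_length=64,                        # assumed length budget
    use_cache=False,
    pad_token_id=tokenizer.eos_token_id,  # GPT-2 has no pad token by default
)
print(tokenizer.decode(pyt_gen_tokens[0], skip_special_tokens=True))
```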
examples/dynamo/torch_compile_resnet_example.py (12 changes: 6 additions & 6 deletions)
@@ -18,7 +18,7 @@
 # %%
 
 # Initialize model with half precision and sample inputs
-model = models.resnet18(pretrained=True).half().eval().to("cuda")
+model = models.resnet18(pretrained=True).to("cuda").half().eval()
 inputs = [torch.randn((1, 3, 224, 224)).to("cuda").half()]
 
 # %%
@@ -63,21 +63,21 @@
 # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 # Does not cause recompilation (same batch size as input)
-new_inputs = [torch.randn((1, 3, 224, 224)).half().to("cuda")]
+new_inputs = [torch.randn((1, 3, 224, 224)).to("cuda").half()]
 new_outputs = optimized_model(*new_inputs)
 
 # %%
 
 # Does cause recompilation (new batch size)
-new_batch_size_inputs = [torch.randn((8, 3, 224, 224)).half().to("cuda")]
+new_batch_size_inputs = [torch.randn((8, 3, 224, 224)).to("cuda").half()]
 new_batch_size_outputs = optimized_model(*new_batch_size_inputs)
 
 # %%
 # Avoid recompilation by specifying dynamic shapes before Torch-TRT compilation
 # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 # The following code illustrates the workflow using ir=torch_compile (which uses torch.compile under the hood)
-inputs_bs8 = torch.randn((8, 3, 224, 224)).half().to("cuda")
+inputs_bs8 = torch.randn((8, 3, 224, 224)).to("cuda").half()
 # This indicates dimension 0 of inputs_bs8 is dynamic whose range of values is [2, 16]
 torch._dynamo.mark_dynamic(inputs_bs8, 0, min=2, max=16)
 optimized_model = torch_tensorrt.compile(
@@ -92,7 +92,7 @@
 outputs_bs8 = optimized_model(inputs_bs8)
 
 # No recompilation happens for batch size = 12
-inputs_bs12 = torch.randn((12, 3, 224, 224)).half().to("cuda")
+inputs_bs12 = torch.randn((12, 3, 224, 224)).to("cuda").half()
 outputs_bs12 = optimized_model(inputs_bs12)
 
 # The following code illustrates the workflow using ir=dynamo (which uses torch.export APIs under the hood)
@@ -112,5 +112,5 @@
 trt_model = torch_tensorrt.compile(model, **compile_spec)
 
 # No recompilation happens for batch size = 12
-inputs_bs12 = torch.randn((12, 3, 224, 224)).half().to("cuda")
+inputs_bs12 = torch.randn((12, 3, 224, 224)).to("cuda").half()
 outputs_bs12 = trt_model(inputs_bs12)
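The `compile_spec` feeding the last hunk is collapsed out of the diff. A hedged reconstruction of what a dynamic-shape spec for the `ir=dynamo` path typically looks like; every value below is an assumption chosen to match the [2, 16] batch range used by `mark_dynamic` above, not the tutorial's verbatim spec:

```python
import torch
import torch_tensorrt

compile_spec = {
    "inputs": [
        torch_tensorrt.Input(
            min_shape=(2, 3, 224, 224),   # smallest batch the engine accepts
            opt_shape=(8, 3, 224, 224),   # shape TensorRT optimizes for
            max_shape=(16, 3, 224, 224),  # largest batch before recompilation
            dtype=torch.half,
        )
    ],
    "ir": "dynamo",
    "enabled_precisions": {torch.half},
}
trt_model = torch_tensorrt.compile(model, **compile_spec)
```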
examples/dynamo/torch_compile_transformers_example.py (2 changes: 1 addition & 1 deletion)
@@ -18,7 +18,7 @@
 # %%
 
 # Initialize model with float precision and sample inputs
-model = BertModel.from_pretrained("bert-base-uncased").eval().to("cuda")
+model = BertModel.from_pretrained("bert-base-uncased").to("cuda").eval()
 inputs = [
     torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda"),
     torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda"),
examples/dynamo/torch_export_cudagraphs.py (6 changes: 3 additions & 3 deletions)
@@ -25,7 +25,7 @@
 # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 # We begin by defining and initializing a model
-model = models.resnet18(pretrained=True).eval().to("cuda")
+model = models.resnet18(pretrained=True).cuda().eval()
 
 # Define sample inputs
 inputs = torch.randn((16, 3, 224, 224)).cuda()
@@ -101,8 +101,8 @@ def forward(self, x):
         return torch.relu((x + 2) * 0.5)
 
 
-model = SampleModel().eval().cuda()
-input = torch.randn((1, 3, 224, 224)).to("cuda")
+model = SampleModel().cuda().eval()
+input = torch.randn((1, 3, 224, 224)).cuda()
 
 # The 'torch_executed_ops' compiler option is used in this example to intentionally introduce graph breaks within the module.
 # Note: The Dynamo backend is required for the CUDA Graph context manager to handle modules in an Ahead-Of-Time (AOT) manner.
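The two comments at the end of that hunk describe the graph-break setup. A hedged sketch of how the pieces fit together; the specific op in `torch_executed_ops` and the context-manager usage are assumptions based on my reading of the cudagraphs tutorial:

```python
import torch_tensorrt

# Force one op back to PyTorch to intentionally split the TRT graph.
opt_with_graph_break = torch_tensorrt.compile(
    model,
    ir="dynamo",
    inputs=[input],
    min_block_size=1,
    torch_executed_ops={"torch.ops.aten.mul.Tensor"},
)

# The CUDA Graph context manager wraps the mixed TRT/PyTorch module AOT.
with torch_tensorrt.runtime.enable_cudagraphs(opt_with_graph_break) as cg_module:
    out = cg_module(input)
```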
examples/dynamo/torch_export_sam2.py (4 changes: 2 additions & 2 deletions)
@@ -110,8 +110,8 @@ def forward(self, image, point_coords, point_labels):
 # Initialize the ``SAM2FullModel`` with the pretrained weights. Since we already initialized
 # ``SAM2ImagePredictor``, we can directly use the model from it (``predictor.model``). We cast the model
 # to FP16 precision for faster performance.
-encoder = predictor.model.eval().cuda()
-sam_model = SAM2FullModel(encoder.half()).eval().cuda()
+encoder = predictor.model.cuda().eval()
+sam_model = SAM2FullModel(encoder.half()).cuda().eval()
 
 # %%
 # Load a sample image provided in the repository.
examples/dynamo/vgg16_ptq.py (2 changes: 1 addition & 1 deletion)
@@ -120,7 +120,7 @@ def vgg16(num_classes=1000, init_weights=False):
 args = PARSER.parse_args()
 
 model = vgg16(num_classes=10, init_weights=False)
-model = model.cuda()
+model = model.cuda().eval()
 
 # %%
 # Load the pre-trained model weights
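The weight-loading step announced by the last comment is collapsed out of the diff. A hedged sketch of the usual pattern; the `--ckpt` flag and the `model_state_dict` key are assumptions about this script, not confirmed by the hunk:

```python
import torch

# Load the pre-trained checkpoint produced by the training script.
ckpt = torch.load(args.ckpt, map_location="cuda")
model.load_state_dict(ckpt["model_state_dict"])
```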