Commit 17afde4

fix: set example models to eval mode and follow the convention (#3770)
1 parent e6b0a88 commit 17afde4

22 files changed: +210 / -170 lines
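
The "convention" in the title is the same pattern applied across every example below: move the model to the target device, switch it to eval mode, and wrap inference in torch.no_grad(). A minimal sketch of that pattern (the Linear module and shapes are placeholders, not taken from this diff):

import torch

# Hypothetical stand-in for the example models touched by this commit.
model = torch.nn.Linear(8, 8).to("cuda").eval()  # device first, then eval()

x = torch.randn(4, 8, device="cuda")

# no_grad() keeps autograd from recording the forward pass during inference,
# which is what the examples below now do around their compiled models.
with torch.no_grad():
    out = model(x)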

examples/dynamo/aot_plugin.py

Lines changed: 5 additions & 4 deletions
@@ -153,7 +153,7 @@ def forward(self, X: torch.Tensor) -> torch.Tensor:
     )
     args = parser.parse_args()

-    my_model = MyModel().to("cuda")
+    my_model = MyModel().to("cuda").eval()
     m = torch.full((64, 64), 2, device="cuda", dtype=torch.float)

     assert my_model(X=m)[0][0] == 3.0
@@ -167,8 +167,9 @@ def forward(self, X: torch.Tensor) -> torch.Tensor:
     )
     print("Model compiled successfully!")
     print("Running inference with compiled model...")
-    for i in range(10):
-        res = model_trt(m)
-        assert torch.allclose(res, my_model(m)), "Results do not match!"
+    with torch.no_grad():
+        for i in range(10):
+            res = model_trt(m)
+            assert torch.allclose(res, my_model(m)), "Results do not match!"

     print("Inference successful!")

examples/dynamo/auto_generate_converters.py

Lines changed: 5 additions & 4 deletions
@@ -169,14 +169,15 @@ def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
         return res


-my_model = MyModel().to("cuda")
+my_model = MyModel().to("cuda").eval()
 m = torch.full((64, 64), 2, device="cuda", dtype=torch.float)
 n = torch.full((64, 64), 3, device="cuda", dtype=torch.float)

 with torch_tensorrt.logging.errors():
     model_trt = torch_tensorrt.compile(my_model, inputs=[m, n], min_block_size=1)
-    for i in range(300):
-        res = model_trt(m, n)
-        assert torch.allclose(res, my_model(m, n))
+    with torch.no_grad():
+        for i in range(300):
+            res = model_trt(m, n)
+            assert torch.allclose(res, my_model(m, n))

 print("Ran with custom plugin!")

examples/dynamo/auto_generate_plugins.py

Lines changed: 5 additions & 4 deletions
@@ -139,14 +139,15 @@ def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
         return res


-my_model = MyModel().to("cuda")
+my_model = MyModel().to("cuda").eval()
 m = torch.randint(0, 5, (64, 64), device="cuda", dtype=torch.float)
 n = torch.randint(0, 5, (64, 64), device="cuda", dtype=torch.float)

 with torch_tensorrt.logging.errors():
     model_trt = torch_tensorrt.compile(my_model, inputs=[m, n], min_block_size=1)
-    for i in range(300):
-        res = model_trt(m, n)
-        assert torch.allclose(res, my_model(m, n))
+    with torch.no_grad():
+        for i in range(300):
+            res = model_trt(m, n)
+            assert torch.allclose(res, my_model(m, n))

 print("Ran with custom plugin!")

examples/dynamo/converter_overloading.py

Lines changed: 8 additions & 8 deletions
@@ -34,7 +34,7 @@ def forward(self, x):
         return torch.nn.functional.gelu(x, approximate=self.mode)


-my_mod = GeLU(mode="tanh")
+my_mod = GeLU(mode="tanh").to("cuda").eval()
 ex_input = torch.randn(2, 5).to("cuda")


@@ -182,9 +182,9 @@ def get_op_count():
 my_custom_gelu = torch_tensorrt.compile(
     my_mod, arg_inputs=(ex_input,), min_block_size=1
 )
-
-print(my_custom_gelu.graph)
-print(my_custom_gelu(ex_input))
+with torch.no_grad():
+    print(my_custom_gelu.graph)
+    print(my_custom_gelu(ex_input))

 # %%
 #
@@ -198,7 +198,7 @@ def get_op_count():
 #
 # Finally, we want to verify that in the case that the ``approximate`` argument is not set to ``tanh``, our custom converter is not used.

-my_mod_erf = GeLU(mode="none")
+my_mod_erf = GeLU(mode="none").to("cuda").eval()
 my_gelu_erf = torch_tensorrt.compile(
     my_mod_erf, arg_inputs=(ex_input,), min_block_size=1
 )
@@ -207,6 +207,6 @@ def get_op_count():
 #
 # Notice that we don't see the print statement from our custom converter, indicating that it was not used. However, looking at the graph, we can still see that a TensorRT engine was created to run the GeLU operation.
 # In this case, the validator for our custom converter returned ``False``, so the conversion system moved on to the next converter in the list, the standard GeLU converter and used that one to convert the operation.
-
-print(my_gelu_erf.graph)
-print(my_gelu_erf(ex_input))
+with torch.no_grad():
+    print(my_gelu_erf.graph)
+    print(my_gelu_erf(ex_input))

examples/dynamo/cross_runtime_compilation_for_windows.py

Lines changed: 3 additions & 2 deletions
@@ -46,7 +46,7 @@

 args = PARSER.parse_args()
 torch.manual_seed(0)
-model = models.resnet18().eval().cuda()
+model = models.resnet18().cuda().eval()
 input = torch.rand((1, 3, 224, 224)).to("cuda")
 inputs = [input]

@@ -63,7 +63,8 @@
     loaded_model = torchtrt.load_cross_compiled_exported_program(args.path).module()
     print(f"model has been successfully loaded from ${args.path}")
     # inference
-    trt_output = loaded_model(input)
+    with torch.no_grad():
+        trt_output = loaded_model(input)
     print(f"inference result: {trt_output}")
 else:
     if platform.system() != "Linux" or platform.architecture()[0] != "64bit":

examples/dynamo/custom_kernel_plugins.py

Lines changed: 7 additions & 4 deletions
@@ -217,8 +217,9 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         return y


-my_model = MyModel((1, 1, 2, 0)).to("cuda")
-my_model(ex_input)
+my_model = MyModel((1, 1, 2, 0)).to("cuda").eval()
+with torch.no_grad():
+    my_model(ex_input)

 ##############################################################################
 # .. code-block:: none
@@ -607,7 +608,8 @@ def circular_padding_converter(
 ##############################################
 # As you can see, now there is only one subgraph created for the TensorRT engine that contains both our custom kernel and the native convolution operator.

-print(trt_model(ex_input))
+with torch.no_grad():
+    print(trt_model(ex_input))

 ##############################################################################
 # .. code-block:: none
@@ -636,7 +638,8 @@ def circular_padding_converter(
 # %%
 # We can verify our implementation is run correctly by both TensorRT and PyTorch

-print(my_model(ex_input) - trt_model(ex_input))
+with torch.no_grad():
+    print(my_model(ex_input) - trt_model(ex_input))

 ##############################################################################
 # .. code-block:: none

examples/dynamo/engine_caching_bert_example.py

Lines changed: 2 additions & 1 deletion
@@ -62,7 +62,8 @@ def compile_bert(iterations=3):
             backend="torch_tensorrt",
             options=compilation_kwargs,
         )
-        optimized_model(*inputs)
+        with torch.no_grad():
+            optimized_model(*inputs)
         end.record()
         torch.cuda.synchronize()
         times.append(start.elapsed_time(end))
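
The context lines of this hunk show the timing harness the BERT example uses: CUDA events recorded around the call, with a synchronize before reading the elapsed time. A standalone sketch of that same pattern (the workload being timed is a placeholder):

import torch

start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)

start.record()
torch.relu(torch.randn(1024, 1024, device="cuda"))  # placeholder workload
end.record()

torch.cuda.synchronize()  # wait until both events have been reached on the GPU
print(start.elapsed_time(end))  # elapsed GPU time in milliseconds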

examples/dynamo/engine_caching_example.py

Lines changed: 5 additions & 3 deletions
@@ -37,7 +37,7 @@
 np.random.seed(0)
 torch.manual_seed(0)

-model = models.resnet18(pretrained=True).eval().to("cuda")
+model = models.resnet18(pretrained=True).to("cuda").eval()
 enabled_precisions = {torch.float}
 min_block_size = 1
 use_python_runtime = False
@@ -100,7 +100,8 @@ def torch_compile(iterations=3):
                 "reuse_cached_engines": reuse_cached_engines,
             },
         )
-        compiled_model(*inputs)  # trigger the compilation
+        with torch.no_grad():
+            compiled_model(*inputs)  # trigger the compilation
         end.record()
         torch.cuda.synchronize()
         times.append(start.elapsed_time(end))
@@ -270,7 +271,8 @@ def torch_compile_my_cache(iterations=3):
                 "custom_engine_cache": engine_cache,
             },
         )
-        compiled_model(*inputs)  # trigger the compilation
+        with torch.no_grad():
+            compiled_model(*inputs)  # trigger the compilation
         end.record()
         torch.cuda.synchronize()
         times.append(start.elapsed_time(end))

examples/dynamo/hierarchical_partitioner_example.py

Lines changed: 2 additions & 1 deletion
@@ -79,7 +79,8 @@ def main():

     print("Original Model Structure:\n", gm)

-    original_output = model(example_input)
+    with torch.no_grad():
+        original_output = model(example_input)

     # 1. Partition the model into blocks that can be executed by different backends
     partitioned_model, op_support = hierarchical_adjacency_partition(

examples/dynamo/llama2_flashinfer_rmsnorm.py

Lines changed: 3 additions & 2 deletions
@@ -220,7 +220,7 @@ def replace_rmsnorm(

 # 2. Initialize model (random weights)
 with torch.no_grad():
-    model = LlamaForCausalLM(config).eval().half()
+    model = LlamaForCausalLM(config).cuda().half().eval()

 # 3. Export with static shapes
 input_ids = torch.randint(0, 32000, (1, 64))  # Static [batch=1, seq=64]
@@ -253,5 +253,6 @@ def replace_rmsnorm(

 input_ids = input_ids.to(DEVICE)

-res = trt_model.forward(input_ids)
+with torch.no_grad():
+    res = trt_model.forward(input_ids)
 print(res)
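
As background for why the commit pins every example to eval mode: layers such as Dropout and BatchNorm behave differently in training and evaluation, so a train-mode reference model can disagree with its compiled TensorRT counterpart in the allclose checks above. A small illustration, independent of this diff:

import torch

drop = torch.nn.Dropout(p=0.5)
x = torch.ones(4)

drop.train()
print(drop(x))  # elements randomly zeroed, survivors scaled by 1 / (1 - p) = 2.0

drop.eval()
print(drop(x))  # identity mapping: eval() disables dropout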
