Skip to content

Commit 33a53cf

Browse files
authored
fix: prelu perf gap on Unet (#3717)
1 parent 17afde4 commit 33a53cf

File tree

2 files changed

+12
-5
lines changed

2 files changed

+12
-5
lines changed

py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -440,7 +440,7 @@ def check_weight_equal(
440440
except Exception:
441441
return torch.all(sd_weight == network_weight)
442442

443-
@needs_refit
443+
@needs_refit # type: ignore[misc]
444444
def _save_weight_mapping(self) -> None:
445445
"""
446446
Construct the weight name mapping from engine weight name to state_dict weight name.
@@ -577,7 +577,7 @@ def _save_weight_mapping(self) -> None:
577577
gc.collect()
578578
torch.cuda.empty_cache()
579579

580-
@needs_refit
580+
@needs_refit # type: ignore[misc]
581581
def _insert_engine_to_cache(self, hash_val: str, serialized_engine: bytes) -> None:
582582
# TODO: @Evan is waiting for TRT's feature to cache the weight-stripped engine
583583
# if not self.compilation_settings.strip_engine_weights:
@@ -605,7 +605,7 @@ def _insert_engine_to_cache(self, hash_val: str, serialized_engine: bytes) -> No
605605
),
606606
)
607607

608-
@needs_refit
608+
@needs_refit # type: ignore[misc]
609609
def _pull_cached_engine(self, hash_val: str) -> Optional[TRTInterpreterResult]:
610610
# query the cached TRT engine
611611
cached_data = self.engine_cache.check(hash_val) # type: ignore[union-attr]
@@ -941,7 +941,14 @@ def output(self, target: str, args: Any, kwargs: Any) -> List[Any]:
941941
f"Specified output dtypes ({len(self.output_dtypes)}) differ from number of outputs ({len(outputs)})"
942942
)
943943

944+
marked_outputs_ids = []
944945
for i, output in enumerate(outputs):
946+
# In some cases, the same output tensor may be marked multiple times, such as _to_copy,
947+
# so we skip marking if the output has already been marked
948+
if id(output) in marked_outputs_ids:
949+
continue
950+
marked_outputs_ids.append(id(output))
951+
945952
name = f"output{i}"
946953

947954
output_dtype = dtype.unknown

py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1096,7 +1096,7 @@ def aten_ops_clone_copy_dtype(
10961096
name,
10971097
args[0],
10981098
kwargs.get("dtype", args[0].dtype),
1099-
force_layer=True,
1099+
force_layer=False, # force_layer=False results in better performance
11001100
)
11011101

11021102

@@ -1228,7 +1228,7 @@ def aten_ops_sum(
12281228
name,
12291229
sum_,
12301230
kwargs["output_dtype"],
1231-
force_layer=True,
1231+
force_layer=False, # force_layer=False results in better performance
12321232
)
12331233
else:
12341234
return sum_

0 commit comments

Comments (0)