From d0ae5904390afded5ca665f7e2f42e4158184e51 Mon Sep 17 00:00:00 2001
From: cehongwang
Date: Fri, 1 Aug 2025 21:31:03 +0000
Subject: [PATCH 1/5] Tentatively eliminate graph break overhead

---
 .../runtime/_PythonTorchTensorRTModule.py | 83 ++++++++++---------
 1 file changed, 42 insertions(+), 41 deletions(-)

diff --git a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py
index 1d619b6ce3..270c798b01 100644
--- a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py
+++ b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py
@@ -2,7 +2,6 @@
 
 import logging
 from contextlib import nullcontext
-from tempfile import tempdir
 from typing import Any, Dict, List, Optional, Sequence, Tuple
 
 import tensorrt as trt
@@ -218,7 +217,8 @@ def __init__(
         self.requires_output_allocator = requires_output_allocator
         self.output_allocator: Optional[DynamicOutputAllocator] = None
         self.use_output_allocator_outputs = False
-
+        self.device = torch.cuda.current_device()
+        self.cudagraphs_enabled = torch_tensorrt.runtime.get_cudagraphs_mode()
         if self.serialized_engine is not None and not self.settings.lazy_engine_init:
             self.setup_engine()
 
@@ -263,7 +263,12 @@ def setup_engine(self) -> None:
         assert (
             self.target_platform == Platform.current_platform()
         ), f"TensorRT engine was not built to target current platform (target: {self.target_platform}, current: {Platform.current_platform()})"
-
+        self._caller_stream = torch.cuda.current_stream()
+        if (
+            self._engine_stream == torch.cuda.default_stream()
+            or self._engine_stream is None
+        ):
+            self._engine_stream = torch.cuda.Stream()
         self.initialized = True
         runtime = trt.Runtime(TRT_LOGGER)
         self.engine = runtime.deserialize_cuda_engine(self.serialized_engine)
@@ -286,10 +291,14 @@ def setup_engine(self) -> None:
             for output_name in self.output_names
         ]
         self.output_shapes = [
-            self.engine.get_tensor_shape(output_name)
+            tuple(self.context.get_tensor_shape(output_name))
             for output_name in self.output_names
         ]
 
+        self.shape_key = "".join(
+            str(tuple(t)).replace(" ", "") for t in self.input_shapes
+        )
+
         if self.requires_output_allocator:
             self.create_output_allocator()
 
@@ -370,9 +379,9 @@ def setup_input_tensors(
                     + contiguous_inputs[i + 1 :]
                 )
 
-            assert (
-                contiguous_inputs[i].dtype == self.input_dtypes[i]
-            ), f"Dtype mismatch for {i}th input({input_name}). Expect {self.input_dtypes[i]}, got {contiguous_inputs[i].dtype}."
+            # assert (
+            #     contiguous_inputs[i].dtype == self.input_dtypes[i]
+            # ), f"Dtype mismatch for {i}th input({input_name}). Expect {self.input_dtypes[i]}, got {contiguous_inputs[i].dtype}."
 
             if need_cudagraphs_record:
                 # If cudagraphs is enabled, this memory is reserved for future cudagraph runs
@@ -409,7 +418,7 @@ def create_output_tensors(self) -> List[torch.Tensor]:
             output = torch.empty(
                 size=self.output_shapes[o],
                 dtype=self.output_dtypes[o],
-                device=torch.cuda.current_device(),
+                device=self.device,
             )
             outputs.append(output)
         return outputs
@@ -480,10 +489,10 @@ def run_standard_execution() -> torch.Tensor | Tuple[torch.Tensor, ...]:
                 if can_use_pre_allocated_outputs:
                     outputs = self.pre_allocated_outputs
                 else:
-                    self.output_shapes = [
-                        tuple(self.context.get_tensor_shape(output_name))
-                        for output_name in self.output_names
-                    ]
+                    # self.output_shapes = [
+                    #     tuple(self.context.get_tensor_shape(output_name))
+                    #     for output_name in self.output_names
+                    # ]
                     if DYNAMIC_DIM in self.output_shapes:
                         raise ValueError(
                             "Encountered dynamic output shapes during runtime. This could mean the network has data-dependent output shapes which is not currently supported."
                         )
@@ -510,42 +519,36 @@ def run_standard_execution() -> torch.Tensor | Tuple[torch.Tensor, ...]:
                 if self.profiling_enabled
                 else nullcontext()
             ):
-                self._caller_stream = torch.cuda.current_stream()
-                if (
-                    self._engine_stream == torch.cuda.default_stream()
-                    or self._engine_stream is None
-                ):
-                    self._engine_stream = torch.cuda.Stream()
 
                 self._engine_stream.wait_stream(self._caller_stream)
 
-                with torch.cuda.stream(self._engine_stream):
-                    if self.cudagraphs_enabled:
-                        if need_cudagraphs_record:
-                            self.cudagraph = torch.cuda.CUDAGraph()
+                # with torch.cuda.stream(self._engine_stream):
+                # if self.cudagraphs_enabled:
+                #     if need_cudagraphs_record:
+                #         self.cudagraph = torch.cuda.CUDAGraph()
 
-                        if self.profiling_enabled:
-                            self.cudagraph.enable_debug_mode()
+                #     if self.profiling_enabled:
+                #         self.cudagraph.enable_debug_mode()
 
-                        with torch.cuda.graph(
-                            self.cudagraph, stream=self._engine_stream
-                        ):
-                            self.context.execute_async_v3(
-                                self._engine_stream.cuda_stream
-                            )
+                #     with torch.cuda.graph(
+                #         self.cudagraph, stream=self._engine_stream
+                #     ):
+                #         self.context.execute_async_v3(
+                #             self._engine_stream.cuda_stream
+                #         )
 
-                        if self.profiling_enabled:
-                            import tempfile
+                #     if self.profiling_enabled:
+                #         import tempfile
 
-                            with tempfile.TemporaryDirectory() as tmpdir:
-                                self.cudagraph.debug_dump(
-                                    f"{tempdir}/{self.name}_cudagraph.dot"
-                                )
+                #         with tempfile.TemporaryDirectory() as tmpdir:
+                #             self.cudagraph.debug_dump(
+                #                 f"{tempdir}/{self.name}_cudagraph.dot"
+                #             )
 
-                        self.cudagraph.replay()  # type: ignore
+                # self.cudagraph.replay()  # type: ignore
 
-                    else:
-                        self.context.execute_async_v3(self._engine_stream.cuda_stream)
+                # else:
+                self.context.execute_async_v3(self._engine_stream.cuda_stream)
 
                 self._caller_stream.wait_stream(self._engine_stream)
 
@@ -646,8 +649,6 @@ def run_output_allocator() -> torch.Tensor | Tuple[torch.Tensor, ...]:
 
             return outputs
 
-        self.cudagraphs_enabled = torch_tensorrt.runtime.get_cudagraphs_mode()
-
         # Run forward function
         contiguous_inputs: List[torch.Tensor] = [
            (i.contiguous() if isinstance(i, torch.Tensor) else torch.tensor(i).cuda())

From 56a89497f2a010173ee7eecf173924d2e2e637f1 Mon Sep 17 00:00:00 2001
From: cehongwang
Date: Tue, 5 Aug 2025 00:19:05 +0000
Subject: [PATCH 2/5] Added stream manipulation and output tensor reuse

---
 .../runtime/_PythonTorchTensorRTModule.py | 82 ++++++++++---------
 1 file changed, 44 insertions(+), 38 deletions(-)

diff --git a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py
index 270c798b01..5fc1742a1b 100644
--- a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py
+++ b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py
@@ -173,6 +173,8 @@ def __init__(
         self.cudagraph: Optional[torch.cuda.CUDAGraph] = None
         self._caller_stream: Optional[torch.cuda.Stream] = None
         self._engine_stream: Optional[torch.cuda.Stream] = None
+        self.output_tensors: Optional[List[torch.Tensor]] = None
+        self.sync_stream = True
 
         # TODO: Make the below a Dictionary {shape: cudagraph}
         self.shape_key: Optional[str] = None
@@ -263,12 +265,16 @@ def setup_engine(self) -> None:
         assert (
             self.target_platform == Platform.current_platform()
         ), f"TensorRT engine was not built to target current platform (target: {self.target_platform}, current: {Platform.current_platform()})"
+        # Stream handling: if the caller stream is the PyTorch default stream, create a new engine stream;
+        # otherwise, use the caller stream and disable stream synchronization
         self._caller_stream = torch.cuda.current_stream()
-        if (
-            self._engine_stream == torch.cuda.default_stream()
-            or self._engine_stream is None
-        ):
+        if self._caller_stream == torch.cuda.default_stream():
             self._engine_stream = torch.cuda.Stream()
+            self.sync_stream = True
+        else:
+            self._engine_stream = self._caller_stream
+            self.sync_stream = False
+
         self.initialized = True
         runtime = trt.Runtime(TRT_LOGGER)
         self.engine = runtime.deserialize_cuda_engine(self.serialized_engine)
@@ -489,15 +495,14 @@ def run_standard_execution() -> torch.Tensor | Tuple[torch.Tensor, ...]:
                 if can_use_pre_allocated_outputs:
                     outputs = self.pre_allocated_outputs
                 else:
-                    # self.output_shapes = [
-                    #     tuple(self.context.get_tensor_shape(output_name))
-                    #     for output_name in self.output_names
-                    # ]
+
                     if DYNAMIC_DIM in self.output_shapes:
                         raise ValueError(
                             "Encountered dynamic output shapes during runtime. This could mean the network has data-dependent output shapes which is not currently supported."
                         )
-                    outputs = self.create_output_tensors()
+                    if self.output_tensors is None:
+                        self.output_tensors = self.create_output_tensors()
+                    outputs = self.output_tensors
 
                 for o, output_name in enumerate(self.output_names):
                     if need_cudagraphs_record:
@@ -520,37 +525,38 @@ def run_standard_execution() -> torch.Tensor | Tuple[torch.Tensor, ...]:
                 if self.profiling_enabled
                 else nullcontext()
             ):
 
-                self._engine_stream.wait_stream(self._caller_stream)
+                if self.sync_stream:
+                    self._engine_stream.wait_stream(self._caller_stream)
 
-                # with torch.cuda.stream(self._engine_stream):
-                # if self.cudagraphs_enabled:
-                #     if need_cudagraphs_record:
-                #         self.cudagraph = torch.cuda.CUDAGraph()
+                if self.cudagraphs_enabled:
+                    if need_cudagraphs_record:
+                        self.cudagraph = torch.cuda.CUDAGraph()
 
-                #     if self.profiling_enabled:
-                #         self.cudagraph.enable_debug_mode()
+                        if self.profiling_enabled:
+                            self.cudagraph.enable_debug_mode()
 
-                #     with torch.cuda.graph(
-                #         self.cudagraph, stream=self._engine_stream
-                #     ):
-                #         self.context.execute_async_v3(
-                #             self._engine_stream.cuda_stream
-                #         )
+                        with torch.cuda.graph(
+                            self.cudagraph, stream=self._engine_stream
+                        ):
+                            self.context.execute_async_v3(
+                                self._engine_stream.cuda_stream
+                            )
 
-                #     if self.profiling_enabled:
-                #         import tempfile
+                        if self.profiling_enabled:
+                            import tempfile
 
-                #         with tempfile.TemporaryDirectory() as tmpdir:
-                #             self.cudagraph.debug_dump(
-                #                 f"{tempdir}/{self.name}_cudagraph.dot"
-                #             )
+                            with tempfile.TemporaryDirectory() as tmpdir:
+                                self.cudagraph.debug_dump(
+                                    f"{tmpdir}/{self.name}_cudagraph.dot"
+                                )
 
-                # self.cudagraph.replay()  # type: ignore
+                    self.cudagraph.replay()  # type: ignore
 
-                # else:
-                self.context.execute_async_v3(self._engine_stream.cuda_stream)
+                else:
+                    self.context.execute_async_v3(self._engine_stream.cuda_stream)
 
-                self._caller_stream.wait_stream(self._engine_stream)
+                if self.sync_stream:
+                    self._caller_stream.wait_stream(self._engine_stream)
 
                 if self.use_pre_allocated_outputs:
                     self.pre_allocated_outputs = self.create_output_tensors()
@@ -753,13 +759,13 @@ def validate_input_shapes(self, inputs: Sequence[torch.Tensor]) -> bool:
         # Representation of input shapes to a given model
         # Shapes are concatenated as so:
         # x: (3, 4), y: (4, 5) --> Key: (3,4)(4,5)
-        tensor_inputs = []
-        for t in inputs:
-            if not isinstance(t, torch.Tensor):
-                return True
-            tensor_inputs.append(t)
+        if not all(isinstance(t, torch.Tensor) for t in inputs):
+            return True
+
         new_shape_key = "".join(
-            str(tuple(t.shape)).replace(" ", "") for t in tensor_inputs
+            str(tuple(t.shape)).replace(" ", "")
+            for t in inputs
+            if isinstance(t, torch.Tensor)
         )
 
         # If the new shape key differs from the existing one,

From 5fb0bebbb9fa0e9dee7430381e5a16dc46639afe Mon Sep 17 00:00:00 2001
From: cehongwang
Date: Wed, 6 Aug 2025 20:59:55 +0000
Subject: [PATCH 3/5] Closed the graph break overhead in the Python runtime

---
 py/torch_tensorrt/dynamo/_compiler.py            |  5 ++++-
 .../dynamo/runtime/_PythonTorchTensorRTModule.py | 12 ++++++++++--
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py
index 74cab980c4..5f6f02d2b1 100644
--- a/py/torch_tensorrt/dynamo/_compiler.py
+++ b/py/torch_tensorrt/dynamo/_compiler.py
@@ -873,7 +873,7 @@ def preserve_module_specs(
     trt_modules = {}
     # Iterate over all components that can be accelerated
     # Generate the corresponding TRT Module for those
-
+    trt_module = None
     for name, _ in partitioned_module.named_children():
         submodule = getattr(partitioned_module, name)
         # filter on the GraphModule
@@ -994,6 +994,9 @@ def preserve_module_specs(
             ) as f:
                 f.write(trt_module.get_layer_info())
 
+    if trt_module and settings.use_python_runtime:
+        trt_module.set_requires_unique_output(True)
+
     # Parse the graph I/O and store it in dryrun tracker
     parse_graph_io(gm, dryrun_tracker)
 
diff --git a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py
index 270c798b01..c501af58be 100644
--- a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py
+++ b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py
@@ -221,8 +221,16 @@ def __init__(
         self.use_output_allocator_outputs = False
         self.device = torch.cuda.current_device()
         self.cudagraphs_enabled = torch_tensorrt.runtime.get_cudagraphs_mode()
+        self.requires_unique_output = False
         if self.serialized_engine is not None and not self.settings.lazy_engine_init:
             self.setup_engine()
+        self.is_shape_inference_io = [
+            self.engine.is_shape_inference_io(input_name)
+            for input_name in self.input_names
+        ]
+
+    def set_requires_unique_output(self, requires_unique_output: bool) -> None:
+        self.requires_unique_output = requires_unique_output
 
     def get_streamable_device_memory_budget(self) -> Any:
         return self.engine.streamable_weights_size
@@ -396,7 +404,7 @@ def setup_input_tensors(
 
             # For shape tensors, we use CPU pointers and for data tensors, we use GPU pointers
             # as per TensorRT requirements
-            if self.engine.is_shape_inference_io(input_name):
+            if self.is_shape_inference_io[i]:
                 # Shape tensor inputs are casted to int64 explicitly
                 # Currently Torch CPU pointers are not working; numpy pointers are used instead
                 # to refer to underlying memory
@@ -500,7 +508,7 @@ def run_standard_execution() -> torch.Tensor | Tuple[torch.Tensor, ...]:
                         raise ValueError(
                             "Encountered dynamic output shapes during runtime. This could mean the network has data-dependent output shapes which is not currently supported."
                        )
-                    if self.output_tensors is None:
+                    if self.output_tensors is None or self.requires_unique_output:
                         self.output_tensors = self.create_output_tensors()
                     outputs = self.output_tensors
 

From 0046f66f1dd97de24ec245a4a517a04a7bab392f Mon Sep 17 00:00:00 2001
From: cehongwang
Date: Wed, 6 Aug 2025 23:27:07 +0000
Subject: [PATCH 4/5] Fixed a bug in dynamic shape handling

---
 .../dynamo/runtime/_PythonTorchTensorRTModule.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py
index c501af58be..f87e278fa7 100644
--- a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py
+++ b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py
@@ -503,7 +503,11 @@ def run_standard_execution() -> torch.Tensor | Tuple[torch.Tensor, ...]:
                 if can_use_pre_allocated_outputs:
                     outputs = self.pre_allocated_outputs
                 else:
-
+                    if shape_changed:
+                        self.output_shapes = [
+                            tuple(self.context.get_tensor_shape(output_name))
+                            for output_name in self.output_names
+                        ]
                     if DYNAMIC_DIM in self.output_shapes:
                         raise ValueError(
                             "Encountered dynamic output shapes during runtime. This could mean the network has data-dependent output shapes which is not currently supported."

From 7259443f81586a22bc9e030b56e66ecd6b8ee07b Mon Sep 17 00:00:00 2001
From: cehongwang
Date: Thu, 7 Aug 2025 20:30:25 +0000
Subject: [PATCH 5/5] Added some comments and an edge case

---
 py/torch_tensorrt/dynamo/_compiler.py            |  1 +
 .../runtime/_PythonTorchTensorRTModule.py        | 20 ++++++++++++++-----
 2 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py
index 5f6f02d2b1..df239eeea2 100644
--- a/py/torch_tensorrt/dynamo/_compiler.py
+++ b/py/torch_tensorrt/dynamo/_compiler.py
@@ -994,6 +994,7 @@ def preserve_module_specs(
             ) as f:
                 f.write(trt_module.get_layer_info())
 
+    # Only set the requires_unique_output flag for the last TRT Module, where the user has access to the output tensor
     if trt_module and settings.use_python_runtime:
         trt_module.set_requires_unique_output(True)
 
diff --git a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py
index f87e278fa7..995835af94 100644
--- a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py
+++ b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py
@@ -378,6 +378,7 @@ def setup_input_tensors(
         contiguous_inputs: List[torch.Tensor],
         cudagraphs_enabled: bool,
         need_cudagraphs_record: bool,
+        shape_changed: bool = True,
     ) -> None:
         for i, input_name in enumerate(self.input_names):
             if not contiguous_inputs[i].is_cuda:
@@ -411,9 +412,10 @@ def setup_input_tensors(
                 inputs_cpu = contiguous_inputs[i].cpu().to(torch.int64).numpy().copy()
                 self.context.set_tensor_address(input_name, inputs_cpu.ctypes.data)
             else:
-                self.context.set_input_shape(
-                    input_name, tuple(contiguous_inputs[i].shape)
-                )
+                if shape_changed:
+                    self.context.set_input_shape(
+                        input_name, tuple(contiguous_inputs[i].shape)
+                    )
                 if cudagraphs_enabled:
                     self._input_buffers[i].copy_(contiguous_inputs[i])
                     self.context.set_tensor_address(
@@ -481,7 +483,11 @@ def run_standard_execution() -> torch.Tensor | Tuple[torch.Tensor, ...]:
             ), f"Wrong number of inputs, expect {len(self.input_names)} get {len(contiguous_inputs)}."
 
            self.setup_input_tensors(
-                contiguous_inputs, self.cudagraphs_enabled, need_cudagraphs_record
+                contiguous_inputs,
+                self.cudagraphs_enabled,
+                need_cudagraphs_record,
+                shape_changed
+                or self.output_tensors is None,  # First time execution
             )
 
             if shape_changed:
@@ -512,7 +518,11 @@ def run_standard_execution() -> torch.Tensor | Tuple[torch.Tensor, ...]:
                         raise ValueError(
                             "Encountered dynamic output shapes during runtime. This could mean the network has data-dependent output shapes which is not currently supported."
                         )
-                    if self.output_tensors is None or self.requires_unique_output:
+                    if (
+                        self.output_tensors is None
+                        or self.requires_unique_output
+                        or shape_changed
+                    ):
                         self.output_tensors = self.create_output_tensors()
                     outputs = self.output_tensors
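
Note on the stream handling introduced in PATCH 2/5: `setup_engine` now picks the engine stream once at engine setup rather than on every forward call — a dedicated stream plus explicit cross-stream synchronization only when the caller is on the PyTorch default stream, and the caller's own stream with synchronization skipped otherwise. A minimal standalone sketch of that policy, outside of TensorRT (assumes a CUDA device is available; `run_kernel` is a hypothetical stand-in for `context.execute_async_v3`):

```python
import torch

def pick_engine_stream() -> "tuple[torch.cuda.Stream, bool]":
    # Same decision as setup_engine: only a caller on the default stream
    # gets a dedicated engine stream and explicit synchronization.
    caller = torch.cuda.current_stream()
    if caller == torch.cuda.default_stream():
        return torch.cuda.Stream(), True
    return caller, False

def launch(run_kernel) -> None:
    caller = torch.cuda.current_stream()
    engine_stream, sync_stream = pick_engine_stream()
    if sync_stream:
        # Engine work must not start before work already queued by the caller.
        engine_stream.wait_stream(caller)
    run_kernel(engine_stream.cuda_stream)  # enqueue onto the chosen stream
    if sync_stream:
        # The caller must not consume outputs before the engine stream finishes.
        caller.wait_stream(engine_stream)
```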
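The output-tensor reuse from PATCHes 2/5 and 3/5 keeps one persistent buffer (`self.output_tensors`) across calls, which is only safe while no one holds a reference to a previous call's result; that is why `_compiler.py` flags only the last TRT module — whose outputs the user actually sees — with `set_requires_unique_output(True)`. A toy, CPU-only illustration of the aliasing hazard the flag avoids (hypothetical class, not the torch_tensorrt API):

```python
import torch

class ReusingModule:
    """Reuses one output buffer unless unique outputs are required."""

    def __init__(self, requires_unique_output: bool) -> None:
        self.requires_unique_output = requires_unique_output
        self.output_tensors = None

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if self.output_tensors is None or self.requires_unique_output:
            self.output_tensors = torch.empty_like(x)
        self.output_tensors.copy_(x * 2)  # stand-in for engine execution
        return self.output_tensors

m = ReusingModule(requires_unique_output=False)
first = m.forward(torch.ones(2))
second = m.forward(torch.full((2,), 3.0))
# Both names alias the same buffer, so `first` now reads tensor([6., 6.]).
assert first.data_ptr() == second.data_ptr()
```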
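Finally, the shape-key mechanism (seeded in `setup_engine` by PATCH 1/5 and used by `validate_input_shapes`) detects input shape changes cheaply by concatenating shapes into a string, e.g. x: (3, 4), y: (4, 5) --> "(3,4)(4,5)"; PATCHes 4/5 and 5/5 then use the resulting `shape_changed` flag to skip `set_input_shape` and output-shape queries when shapes repeat. A small sketch of the check:

```python
import torch

def shape_key(tensors) -> str:
    # x: (3, 4), y: (4, 5) --> "(3,4)(4,5)", mirroring validate_input_shapes
    return "".join(str(tuple(t.shape)).replace(" ", "") for t in tensors)

cached_key = None
for batch in (torch.zeros(3, 4), torch.zeros(3, 4), torch.zeros(5, 4)):
    new_key = shape_key([batch])
    shape_changed = new_key != cached_key
    cached_key = new_key
    print(new_key, "-> reconfigure" if shape_changed else "-> reuse cached state")
```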