From d0ae5904390afded5ca665f7e2f42e4158184e51 Mon Sep 17 00:00:00 2001
From: cehongwang
Date: Fri, 1 Aug 2025 21:31:03 +0000
Subject: [PATCH 1/5] Tentatively eliminate graph break overhead

---
 .../runtime/_PythonTorchTensorRTModule.py | 83 ++++++++++---------
 1 file changed, 42 insertions(+), 41 deletions(-)

diff --git a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py
index 1d619b6ce3..270c798b01 100644
--- a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py
+++ b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py
@@ -2,7 +2,6 @@
 
 import logging
 from contextlib import nullcontext
-from tempfile import tempdir
 from typing import Any, Dict, List, Optional, Sequence, Tuple
 
 import tensorrt as trt
@@ -218,7 +217,8 @@ def __init__(
         self.requires_output_allocator = requires_output_allocator
         self.output_allocator: Optional[DynamicOutputAllocator] = None
         self.use_output_allocator_outputs = False
-
+        self.device = torch.cuda.current_device()
+        self.cudagraphs_enabled = torch_tensorrt.runtime.get_cudagraphs_mode()
         if self.serialized_engine is not None and not self.settings.lazy_engine_init:
             self.setup_engine()
 
@@ -263,7 +263,12 @@ def setup_engine(self) -> None:
         assert (
             self.target_platform == Platform.current_platform()
         ), f"TensorRT engine was not built to target current platform (target: {self.target_platform}, current: {Platform.current_platform()})"
-
+        self._caller_stream = torch.cuda.current_stream()
+        if (
+            self._engine_stream == torch.cuda.default_stream()
+            or self._engine_stream is None
+        ):
+            self._engine_stream = torch.cuda.Stream()
         self.initialized = True
         runtime = trt.Runtime(TRT_LOGGER)
         self.engine = runtime.deserialize_cuda_engine(self.serialized_engine)
@@ -286,10 +291,14 @@ def setup_engine(self) -> None:
             for output_name in self.output_names
         ]
         self.output_shapes = [
-            self.engine.get_tensor_shape(output_name)
+            tuple(self.context.get_tensor_shape(output_name))
             for output_name in self.output_names
         ]
 
+        self.shape_key = "".join(
+            str(tuple(t)).replace(" ", "") for t in self.input_shapes
+        )
+
         if self.requires_output_allocator:
             self.create_output_allocator()
 
@@ -370,9 +379,9 @@ def setup_input_tensors(
                     + contiguous_inputs[i + 1 :]
                 )
 
-            assert (
-                contiguous_inputs[i].dtype == self.input_dtypes[i]
-            ), f"Dtype mismatch for {i}th input({input_name}). Expect {self.input_dtypes[i]}, got {contiguous_inputs[i].dtype}."
+            # assert (
+            #     contiguous_inputs[i].dtype == self.input_dtypes[i]
+            # ), f"Dtype mismatch for {i}th input({input_name}). Expect {self.input_dtypes[i]}, got {contiguous_inputs[i].dtype}."
 
             if need_cudagraphs_record:
                 # If cudagraphs is enabled, this memory is reserved for future cudagraph runs
@@ -409,7 +418,7 @@ def create_output_tensors(self) -> List[torch.Tensor]:
             output = torch.empty(
                 size=self.output_shapes[o],
                 dtype=self.output_dtypes[o],
-                device=torch.cuda.current_device(),
+                device=self.device,
             )
             outputs.append(output)
         return outputs
@@ -480,10 +489,10 @@ def run_standard_execution() -> torch.Tensor | Tuple[torch.Tensor, ...]:
                 if can_use_pre_allocated_outputs:
                     outputs = self.pre_allocated_outputs
                 else:
-                    self.output_shapes = [
-                        tuple(self.context.get_tensor_shape(output_name))
-                        for output_name in self.output_names
-                    ]
+                    # self.output_shapes = [
+                    #     tuple(self.context.get_tensor_shape(output_name))
+                    #     for output_name in self.output_names
+                    # ]
                     if DYNAMIC_DIM in self.output_shapes:
                         raise ValueError(
                             "Encountered dynamic output shapes during runtime. This could mean the network has data-dependent output shapes which is not currently supported."
                         )
@@ -510,42 +519,36 @@ def run_standard_execution() -> torch.Tensor | Tuple[torch.Tensor, ...]:
                 if self.profiling_enabled
                 else nullcontext()
             ):
-                self._caller_stream = torch.cuda.current_stream()
-                if (
-                    self._engine_stream == torch.cuda.default_stream()
-                    or self._engine_stream is None
-                ):
-                    self._engine_stream = torch.cuda.Stream()
 
                 self._engine_stream.wait_stream(self._caller_stream)
 
-                with torch.cuda.stream(self._engine_stream):
-                    if self.cudagraphs_enabled:
-                        if need_cudagraphs_record:
-                            self.cudagraph = torch.cuda.CUDAGraph()
+                # with torch.cuda.stream(self._engine_stream):
+                # if self.cudagraphs_enabled:
+                #     if need_cudagraphs_record:
+                #         self.cudagraph = torch.cuda.CUDAGraph()
 
-                        if self.profiling_enabled:
-                            self.cudagraph.enable_debug_mode()
+                #     if self.profiling_enabled:
+                #         self.cudagraph.enable_debug_mode()
 
-                        with torch.cuda.graph(
-                            self.cudagraph, stream=self._engine_stream
-                        ):
-                            self.context.execute_async_v3(
-                                self._engine_stream.cuda_stream
-                            )
+                #     with torch.cuda.graph(
+                #         self.cudagraph, stream=self._engine_stream
+                #     ):
+                #         self.context.execute_async_v3(
+                #             self._engine_stream.cuda_stream
+                #         )
 
-                        if self.profiling_enabled:
-                            import tempfile
+                #     if self.profiling_enabled:
+                #         import tempfile
 
-                            with tempfile.TemporaryDirectory() as tmpdir:
-                                self.cudagraph.debug_dump(
-                                    f"{tempdir}/{self.name}_cudagraph.dot"
-                                )
+                #         with tempfile.TemporaryDirectory() as tmpdir:
+                #             self.cudagraph.debug_dump(
+                #                 f"{tempdir}/{self.name}_cudagraph.dot"
+                #             )
 
-                        self.cudagraph.replay()  # type: ignore
+                # self.cudagraph.replay()  # type: ignore
 
-                    else:
-                        self.context.execute_async_v3(self._engine_stream.cuda_stream)
+                # else:
+                self.context.execute_async_v3(self._engine_stream.cuda_stream)
 
                 self._caller_stream.wait_stream(self._engine_stream)
 
@@ -646,8 +649,6 @@ def run_output_allocator() -> torch.Tensor | Tuple[torch.Tensor, ...]:
 
             return outputs
 
-        self.cudagraphs_enabled = torch_tensorrt.runtime.get_cudagraphs_mode()
-
         # Run forward function
         contiguous_inputs: List[torch.Tensor] = [
            (i.contiguous() if isinstance(i, torch.Tensor) else torch.tensor(i).cuda())

From 56a89497f2a010173ee7eecf173924d2e2e637f1 Mon Sep 17 00:00:00 2001
From: cehongwang
Date: Tue, 5 Aug 2025 00:19:05 +0000
Subject: [PATCH 2/5] Added stream manipulation and output tensor reuse

---
 .../runtime/_PythonTorchTensorRTModule.py | 82 ++++++++++---------
 1 file changed, 44 insertions(+), 38 deletions(-)

diff --git a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py
index 270c798b01..5fc1742a1b 100644
--- a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py
+++ b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py
@@ -173,6 +173,8 @@ def __init__(
         self.cudagraph: Optional[torch.cuda.CUDAGraph] = None
         self._caller_stream: Optional[torch.cuda.Stream] = None
         self._engine_stream: Optional[torch.cuda.Stream] = None
+        self.output_tensors: Optional[List[torch.Tensor]] = None
+        self.sync_stream = True
 
         # TODO: Make the below a Dictionary {shape: cudagraph}
         self.shape_key: Optional[str] = None
@@ -263,12 +265,16 @@ def setup_engine(self) -> None:
         assert (
             self.target_platform == Platform.current_platform()
         ), f"TensorRT engine was not built to target current platform (target: {self.target_platform}, current: {Platform.current_platform()})"
+        # Stream handling: if the caller stream is the PyTorch default stream, create a new engine stream;
+        # otherwise, use the caller stream and disable stream synchronization
         self._caller_stream = torch.cuda.current_stream()
-        if (
-            self._engine_stream == torch.cuda.default_stream()
-            or self._engine_stream is None
-        ):
+        if self._caller_stream == torch.cuda.default_stream():
             self._engine_stream = torch.cuda.Stream()
+            self.sync_stream = True
+        else:
+            self._engine_stream = self._caller_stream
+            self.sync_stream = False
+
         self.initialized = True
         runtime = trt.Runtime(TRT_LOGGER)
         self.engine = runtime.deserialize_cuda_engine(self.serialized_engine)
@@ -489,15 +495,14 @@ def run_standard_execution() -> torch.Tensor | Tuple[torch.Tensor, ...]:
                 if can_use_pre_allocated_outputs:
                     outputs = self.pre_allocated_outputs
                 else:
-                    # self.output_shapes = [
-                    #     tuple(self.context.get_tensor_shape(output_name))
-                    #     for output_name in self.output_names
-                    # ]
+
                     if DYNAMIC_DIM in self.output_shapes:
                         raise ValueError(
                             "Encountered dynamic output shapes during runtime. This could mean the network has data-dependent output shapes which is not currently supported."
                         )
-                    outputs = self.create_output_tensors()
+                    if self.output_tensors is None:
+                        self.output_tensors = self.create_output_tensors()
+                    outputs = self.output_tensors
 
                 for o, output_name in enumerate(self.output_names):
                     if need_cudagraphs_record:
@@ -520,37 +525,38 @@ def run_standard_execution() -> torch.Tensor | Tuple[torch.Tensor, ...]:
                 if self.profiling_enabled
                 else nullcontext()
             ):
 
-                self._engine_stream.wait_stream(self._caller_stream)
+                if self.sync_stream:
+                    self._engine_stream.wait_stream(self._caller_stream)
 
-                # with torch.cuda.stream(self._engine_stream):
-                # if self.cudagraphs_enabled:
-                #     if need_cudagraphs_record:
-                #         self.cudagraph = torch.cuda.CUDAGraph()
+                if self.cudagraphs_enabled:
+                    if need_cudagraphs_record:
+                        self.cudagraph = torch.cuda.CUDAGraph()
 
-                #     if self.profiling_enabled:
-                #         self.cudagraph.enable_debug_mode()
+                        if self.profiling_enabled:
+                            self.cudagraph.enable_debug_mode()
 
-                #     with torch.cuda.graph(
-                #         self.cudagraph, stream=self._engine_stream
-                #     ):
-                #         self.context.execute_async_v3(
-                #             self._engine_stream.cuda_stream
-                #         )
+                        with torch.cuda.graph(
+                            self.cudagraph, stream=self._engine_stream
+                        ):
+                            self.context.execute_async_v3(
+                                self._engine_stream.cuda_stream
+                            )
 
-                #     if self.profiling_enabled:
-                #         import tempfile
+                        if self.profiling_enabled:
+                            import tempfile
 
-                #         with tempfile.TemporaryDirectory() as tmpdir:
-                #             self.cudagraph.debug_dump(
-                #                 f"{tempdir}/{self.name}_cudagraph.dot"
-                #             )
+                            with tempfile.TemporaryDirectory() as tmpdir:
+                                self.cudagraph.debug_dump(
+                                    f"{tmpdir}/{self.name}_cudagraph.dot"
+                                )
 
-                # self.cudagraph.replay()  # type: ignore
+                    self.cudagraph.replay()  # type: ignore
 
-                # else:
-                self.context.execute_async_v3(self._engine_stream.cuda_stream)
+                else:
+                    self.context.execute_async_v3(self._engine_stream.cuda_stream)
 
-                self._caller_stream.wait_stream(self._engine_stream)
+                if self.sync_stream:
+                    self._caller_stream.wait_stream(self._engine_stream)
 
                 if self.use_pre_allocated_outputs:
                     self.pre_allocated_outputs = self.create_output_tensors()
@@ -753,13 +759,13 @@ def validate_input_shapes(self, inputs: Sequence[torch.Tensor]) -> bool:
         # Representation of input shapes to a given model
         # Shapes are concatenated as so:
         # x: (3, 4), y: (4, 5) --> Key: (3,4)(4,5)
-        tensor_inputs = []
-        for t in inputs:
-            if not isinstance(t, torch.Tensor):
-                return True
-            tensor_inputs.append(t)
+        if not all(isinstance(t, torch.Tensor) for t in inputs):
+            return True
+
         new_shape_key = "".join(
-            str(tuple(t.shape)).replace(" ", "") for t in tensor_inputs
+            str(tuple(t.shape)).replace(" ", "")
+            for t in inputs
+            if isinstance(t, torch.Tensor)
         )
 
         # If the new shape key differs from the existing one,

From 5fb0bebbb9fa0e9dee7430381e5a16dc46639afe Mon Sep 17 00:00:00 2001
From: cehongwang
Date: Wed, 6 Aug 2025 20:59:55 +0000
Subject: [PATCH 3/5] Closed the graph break overhead in the Python runtime

---
 py/torch_tensorrt/dynamo/_compiler.py            |  5 ++++-
 .../dynamo/runtime/_PythonTorchTensorRTModule.py | 12 ++++++++++--
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py
index 74cab980c4..5f6f02d2b1 100644
--- a/py/torch_tensorrt/dynamo/_compiler.py
+++ b/py/torch_tensorrt/dynamo/_compiler.py
@@ -873,7 +873,7 @@ def preserve_module_specs(
     trt_modules = {}
     # Iterate over all components that can be accelerated
     # Generate the corresponding TRT Module for those
-
+    trt_module = None
     for name, _ in partitioned_module.named_children():
         submodule = getattr(partitioned_module, name)
         # filter on the GraphModule
@@ -994,6 +994,9 @@ def preserve_module_specs(
             ) as f:
                 f.write(trt_module.get_layer_info())
 
+    if trt_module and settings.use_python_runtime:
+        trt_module.set_requires_unique_output(True)
+
     # Parse the graph I/O and store it in dryrun tracker
     parse_graph_io(gm, dryrun_tracker)
 
diff --git a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py
index 270c798b01..c501af58be 100644
--- a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py
+++ b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py
@@ -221,8 +221,16 @@ def __init__(
         self.use_output_allocator_outputs = False
         self.device = torch.cuda.current_device()
         self.cudagraphs_enabled = torch_tensorrt.runtime.get_cudagraphs_mode()
+        self.requires_unique_output = False
         if self.serialized_engine is not None and not self.settings.lazy_engine_init:
             self.setup_engine()
+        self.is_shape_inference_io = [
+            self.engine.is_shape_inference_io(input_name)
+            for input_name in self.input_names
+        ]
+
+    def set_requires_unique_output(self, requires_unique_output: bool) -> None:
+        self.requires_unique_output = requires_unique_output
 
     def get_streamable_device_memory_budget(self) -> Any:
         return self.engine.streamable_weights_size
@@ -396,7 +404,7 @@ def setup_input_tensors(
 
             # For shape tensors, we use CPU pointers and for data tensors, we use GPU pointers
             # as per TensorRT requirements
-            if self.engine.is_shape_inference_io(input_name):
+            if self.is_shape_inference_io[i]:
                 # Shape tensor inputs are casted to int64 explicitly
                 # Currently Torch CPU pointers are not working; numpy pointers are used instead
                 # to refer to underlying memory
@@ -500,7 +508,7 @@ def run_standard_execution() -> torch.Tensor | Tuple[torch.Tensor, ...]:
                         raise ValueError(
                             "Encountered dynamic output shapes during runtime. This could mean the network has data-dependent output shapes which is not currently supported."
                        )
-                    if self.output_tensors is None:
+                    if self.output_tensors is None or self.requires_unique_output:
                         self.output_tensors = self.create_output_tensors()
                     outputs = self.output_tensors
 

From 0046f66f1dd97de24ec245a4a517a04a7bab392f Mon Sep 17 00:00:00 2001
From: cehongwang
Date: Wed, 6 Aug 2025 23:27:07 +0000
Subject: [PATCH 4/5] Fixed a bug in dynamic shape handling

---
 .../dynamo/runtime/_PythonTorchTensorRTModule.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py
index c501af58be..f87e278fa7 100644
--- a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py
+++ b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py
@@ -503,7 +503,11 @@ def run_standard_execution() -> torch.Tensor | Tuple[torch.Tensor, ...]:
                 if can_use_pre_allocated_outputs:
                     outputs = self.pre_allocated_outputs
                 else:
-
+                    if shape_changed:
+                        self.output_shapes = [
+                            tuple(self.context.get_tensor_shape(output_name))
+                            for output_name in self.output_names
+                        ]
                     if DYNAMIC_DIM in self.output_shapes:
                         raise ValueError(
                             "Encountered dynamic output shapes during runtime. This could mean the network has data-dependent output shapes which is not currently supported."

From 7259443f81586a22bc9e030b56e66ecd6b8ee07b Mon Sep 17 00:00:00 2001
From: cehongwang
Date: Thu, 7 Aug 2025 20:30:25 +0000
Subject: [PATCH 5/5] Added some comments and an edge case

---
 py/torch_tensorrt/dynamo/_compiler.py            |  1 +
 .../runtime/_PythonTorchTensorRTModule.py        | 20 ++++++++++++++-----
 2 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py
index 5f6f02d2b1..df239eeea2 100644
--- a/py/torch_tensorrt/dynamo/_compiler.py
+++ b/py/torch_tensorrt/dynamo/_compiler.py
@@ -994,6 +994,7 @@ def preserve_module_specs(
             ) as f:
                 f.write(trt_module.get_layer_info())
 
+    # Only set the requires_unique_output flag for the last TRT Module, where the user has access to the output tensor
     if trt_module and settings.use_python_runtime:
         trt_module.set_requires_unique_output(True)
 
diff --git a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py
index f87e278fa7..995835af94 100644
--- a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py
+++ b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py
@@ -378,6 +378,7 @@ def setup_input_tensors(
         contiguous_inputs: List[torch.Tensor],
         cudagraphs_enabled: bool,
         need_cudagraphs_record: bool,
+        shape_changed: bool = True,
     ) -> None:
         for i, input_name in enumerate(self.input_names):
             if not contiguous_inputs[i].is_cuda:
@@ -411,9 +412,10 @@ def setup_input_tensors(
                 inputs_cpu = contiguous_inputs[i].cpu().to(torch.int64).numpy().copy()
                 self.context.set_tensor_address(input_name, inputs_cpu.ctypes.data)
             else:
-                self.context.set_input_shape(
-                    input_name, tuple(contiguous_inputs[i].shape)
-                )
+                if shape_changed:
+                    self.context.set_input_shape(
+                        input_name, tuple(contiguous_inputs[i].shape)
+                    )
                 if cudagraphs_enabled:
                     self._input_buffers[i].copy_(contiguous_inputs[i])
                     self.context.set_tensor_address(
@@ -481,7 +483,11 @@ def run_standard_execution() -> torch.Tensor | Tuple[torch.Tensor, ...]:
             ), f"Wrong number of inputs, expect {len(self.input_names)} get {len(contiguous_inputs)}."
 
            self.setup_input_tensors(
-                contiguous_inputs, self.cudagraphs_enabled, need_cudagraphs_record
+                contiguous_inputs,
+                self.cudagraphs_enabled,
+                need_cudagraphs_record,
+                shape_changed
+                or self.output_tensors is None,  # First time execution
             )
 
             if shape_changed:
@@ -512,7 +518,11 @@ def run_standard_execution() -> torch.Tensor | Tuple[torch.Tensor, ...]:
                         raise ValueError(
                             "Encountered dynamic output shapes during runtime. This could mean the network has data-dependent output shapes which is not currently supported."
                         )
-                    if self.output_tensors is None or self.requires_unique_output:
+                    if (
+                        self.output_tensors is None
+                        or self.requires_unique_output
+                        or shape_changed
+                    ):
                         self.output_tensors = self.create_output_tensors()
                     outputs = self.output_tensors
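
Note on the stream handling introduced in PATCH 2/5: `setup_engine` now picks the engine stream once at engine setup rather than on every forward call — a dedicated stream plus explicit cross-stream synchronization only when the caller is on the PyTorch default stream, and the caller's own stream with synchronization skipped otherwise. A minimal standalone sketch of that policy, outside of TensorRT (assumes a CUDA device is available; `run_kernel` is a hypothetical stand-in for `context.execute_async_v3`):

```python
import torch

def pick_engine_stream() -> "tuple[torch.cuda.Stream, bool]":
    # Same decision as setup_engine: only a caller on the default stream
    # gets a dedicated engine stream and explicit synchronization.
    caller = torch.cuda.current_stream()
    if caller == torch.cuda.default_stream():
        return torch.cuda.Stream(), True
    return caller, False

def launch(run_kernel) -> None:
    caller = torch.cuda.current_stream()
    engine_stream, sync_stream = pick_engine_stream()
    if sync_stream:
        # Engine work must not start before work already queued by the caller.
        engine_stream.wait_stream(caller)
    run_kernel(engine_stream.cuda_stream)  # enqueue onto the chosen stream
    if sync_stream:
        # The caller must not consume outputs before the engine stream finishes.
        caller.wait_stream(engine_stream)
```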
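The output-tensor reuse from PATCHes 2/5 and 3/5 keeps one persistent buffer (`self.output_tensors`) across calls, which is only safe while no one holds a reference to a previous call's result; that is why `_compiler.py` flags only the last TRT module — whose outputs the user actually sees — with `set_requires_unique_output(True)`. A toy, CPU-only illustration of the aliasing hazard the flag avoids (hypothetical class, not the torch_tensorrt API):

```python
import torch

class ReusingModule:
    """Reuses one output buffer unless unique outputs are required."""

    def __init__(self, requires_unique_output: bool) -> None:
        self.requires_unique_output = requires_unique_output
        self.output_tensors = None

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if self.output_tensors is None or self.requires_unique_output:
            self.output_tensors = torch.empty_like(x)
        self.output_tensors.copy_(x * 2)  # stand-in for engine execution
        return self.output_tensors

m = ReusingModule(requires_unique_output=False)
first = m.forward(torch.ones(2))
second = m.forward(torch.full((2,), 3.0))
# Both names alias the same buffer, so `first` now reads tensor([6., 6.]).
assert first.data_ptr() == second.data_ptr()
```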
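Finally, the shape-key mechanism (seeded in `setup_engine` by PATCH 1/5 and used by `validate_input_shapes`) detects input shape changes cheaply by concatenating shapes into a string, e.g. x: (3, 4), y: (4, 5) --> "(3,4)(4,5)"; PATCHes 4/5 and 5/5 then use the resulting `shape_changed` flag to skip `set_input_shape` and output-shape queries when shapes repeat. A small sketch of the check:

```python
import torch

def shape_key(tensors) -> str:
    # x: (3, 4), y: (4, 5) --> "(3,4)(4,5)", mirroring validate_input_shapes
    return "".join(str(tuple(t.shape)).replace(" ", "") for t in tensors)

cached_key = None
for batch in (torch.zeros(3, 4), torch.zeros(3, 4), torch.zeros(5, 4)):
    new_key = shape_key([batch])
    shape_changed = new_key != cached_key
    cached_key = new_key
    print(new_key, "-> reconfigure" if shape_changed else "-> reuse cached state")
```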