Skip to content

Commit a9a27b1

Browse files
committed
Added cpp optimization still slower than python runtime
1 parent a537d9f commit a9a27b1

File tree

8 files changed

+98
-85
lines changed

8 files changed

+98
-85
lines changed

core/runtime/TRTEngine.cpp

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -237,6 +237,14 @@ TRTEngine::TRTEngine(
237237
out_binding_names[pyt_idx] = binding_name;
238238
}
239239
num_io = std::make_pair(inputs_size, outputs);
240+
241+
this->current_device_id = at::cuda::current_device();
242+
this->stream = c10::cuda::getCurrentCUDAStream(this->current_device_id);
243+
this->io_size = this->cuda_engine->getNbIOTensors();
244+
for (int64_t i = 0; i < this->in_binding_names.size(); i++) {
245+
this->isShapeInferenceIO[this->in_binding_names[i]] =
246+
this->cuda_engine->isShapeInferenceIO(this->in_binding_names[i].c_str());
247+
}
240248
}
241249

242250
#ifndef NDEBUG
@@ -281,6 +289,14 @@ void TRTEngine::enable_profiling() {
281289
exec_ctx->setProfiler(trt_engine_profiler.get());
282290
}
283291

292+
void TRTEngine::set_requires_new_output_tensor(bool enable) {
293+
this->requires_new_output_tensor = enable;
294+
}
295+
296+
bool TRTEngine::get_requires_new_output_tensor() {
297+
return this->requires_new_output_tensor;
298+
}
299+
284300
void TRTEngine::set_profile_format(std::string format) {
285301
if (format == "trex") {
286302
this->trt_engine_profiler->set_profile_format(TraceFormat::kTREX);

core/runtime/TRTEngine.h

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,9 @@ struct TRTEngine : torch::CustomClassHolder {
103103
std::shared_ptr<nvinfer1::ICudaEngine> cuda_engine;
104104
std::shared_ptr<nvinfer1::IExecutionContext> exec_ctx;
105105
std::pair<uint64_t, uint64_t> num_io;
106+
uint64_t io_size;
107+
std::map<std::string, bool> isShapeInferenceIO;
108+
bool requires_new_output_tensor = false;
106109
std::string name;
107110
RTDevice device_info;
108111

@@ -159,6 +162,8 @@ struct TRTEngine : torch::CustomClassHolder {
159162
int64_t get_automatic_device_memory_budget();
160163
std::vector<at::Tensor> infer_outputs(std::vector<std::vector<int64_t>> input_shapes);
161164
void set_pre_allocated_outputs(bool enable);
165+
void set_requires_new_output_tensor(bool enable);
166+
bool get_requires_new_output_tensor();
162167
TorchTRTRuntimeStates runtime_states;
163168
friend std::ostream& operator<<(std::ostream& os, const TRTEngine& engine);
164169
static const char BINDING_DELIM = '%';
@@ -169,13 +174,14 @@ struct TRTEngine : torch::CustomClassHolder {
169174

170175
// CUDAGraph-Related Functionality
171176
at::cuda::CUDAGraph cudagraph = {};
172-
at::cuda::CUDAStream engine_stream = c10::cuda::getDefaultCUDAStream();
173-
at::cuda::CUDAStream caller_stream = c10::cuda::getDefaultCUDAStream();
177+
at::cuda::CUDAStream stream = c10::cuda::getDefaultCUDAStream();
178+
int64_t current_device_id = at::cuda::current_device();
174179
std::vector<at::Tensor> input_buffers = {};
175180
std::vector<at::Tensor> output_buffers = {};
176181
std::string shape_key = "None";
177182
bool use_pre_allocated_outputs = false;
178183
std::vector<at::Tensor> pre_allocated_outputs;
184+
std::vector<at::Tensor> allocated_outputs;
179185

180186
// Output Allocator-Related Functionality
181187
bool requires_output_allocator = false; // engine requires output allocator

core/runtime/execute_engine.cpp

Lines changed: 45 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,8 @@ void setup_input_tensors(
9696
std::vector<at::Tensor> inputs,
9797
c10::intrusive_ptr<TRTEngine> compiled_engine,
9898
bool cudagraphs_enabled,
99-
bool need_cudagraphs_record) {
99+
bool need_cudagraphs_record,
100+
bool shape_changed) {
100101
// this is a buffer to store shape tensor input addresses throughout the runtime scope
101102
std::list<std::vector<int64_t>> inputShapeTensorValues;
102103
std::list<at::Tensor> formatted_inputs(compiled_engine->num_io.first);
@@ -117,7 +118,7 @@ void setup_input_tensors(
117118
auto shape = core::util::toVec(dims);
118119
LOG_DEBUG("Input Name: " << name << " Shape: " << dims);
119120

120-
if (compiled_engine->cuda_engine->isShapeInferenceIO(name.c_str())) {
121+
if (compiled_engine->isShapeInferenceIO[name]) {
121122
// Shape tensor inputs are cast to int64 explicitly.
122123
// Refer to
123124
// https://github.com/NVIDIA/TensorRT/blob/d2f4ef789a9a6ffdf37b55c3f81b486225f6b380/samples/common/sampleInference.cpp#L435
@@ -145,10 +146,10 @@ void setup_input_tensors(
145146
// Create a new persistent input buffer
146147
compiled_engine->input_buffers[i] = std::move(formatted_inputs.back().clone());
147148
}
148-
149-
TORCHTRT_CHECK(
150-
compiled_engine->exec_ctx->setInputShape(name.c_str(), dims), "Error while setting the input shape");
151-
149+
if (shape_changed) {
150+
TORCHTRT_CHECK(
151+
compiled_engine->exec_ctx->setInputShape(name.c_str(), dims), "Error while setting the input shape");
152+
}
152153
if (cudagraphs_enabled) {
153154
// If using CUDAGraphs copy formatted input to the corresponding persistent input buffer
154155
compiled_engine->input_buffers[i].copy_(formatted_inputs.back(), true);
@@ -217,7 +218,7 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
217218
compiled_engine->cudagraph.reset();
218219
}
219220

220-
std::vector<at::Tensor> outputs(compiled_engine->num_io.second);
221+
std::vector<at::Tensor> outputs;
221222

222223
// Initialize inputs and outputs to be available throughout the succeeding scopes
223224
{ // Input Setup
@@ -226,10 +227,9 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
226227
input_profiler_guard =
227228
std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->input_profile_path);
228229
}
229-
230-
setup_input_tensors(inputs, compiled_engine, cudagraphs_enabled, need_cudagraphs_record);
230+
setup_input_tensors(inputs, compiled_engine, cudagraphs_enabled, need_cudagraphs_record, shape_changed);
231231
// Check if input shapes can be inferred.
232-
int32_t const io_size{compiled_engine->cuda_engine->getNbIOTensors()};
232+
int32_t const io_size{compiled_engine->io_size};
233233
std::vector<char const*> names(io_size);
234234
int32_t const nbNames = compiled_engine->exec_ctx->inferShapes(names.size(), names.data());
235235
TORCHTRT_CHECK(
@@ -240,6 +240,7 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
240240
}
241241

242242
{ // Output Setup
243+
bool new_outputs = false;
243244
std::unique_ptr<torch::autograd::profiler::RecordProfile> output_profiler_guard;
244245
if (compiled_engine->profile_execution) {
245246
output_profiler_guard =
@@ -248,64 +249,60 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
248249
if (can_use_pre_allocated_outputs) {
249250
outputs = compiled_engine->pre_allocated_outputs;
250251
} else {
251-
outputs = create_output_tensors(compiled_engine);
252+
if (compiled_engine->allocated_outputs.size() == 0 or compiled_engine->requires_new_output_tensor or
253+
shape_changed) {
254+
compiled_engine->allocated_outputs = create_output_tensors(compiled_engine);
255+
new_outputs = true;
256+
}
257+
outputs = compiled_engine->allocated_outputs;
252258
}
253259

254-
for (auto output_indices : compiled_engine->out_binding_map) {
255-
auto pyt_idx = output_indices.second;
256-
std::string name = compiled_engine->out_binding_names[pyt_idx];
257-
if (need_cudagraphs_record) {
258-
// If we are recording the cuda graph then we need to update the persistent output buffer
259-
compiled_engine->output_buffers[pyt_idx] = std::move(outputs[pyt_idx].clone());
260-
}
260+
if (new_outputs) {
261+
for (auto output_indices : compiled_engine->out_binding_map) {
262+
auto pyt_idx = output_indices.second;
263+
std::string name = compiled_engine->out_binding_names[pyt_idx];
264+
if (need_cudagraphs_record) {
265+
// If we are recording the cuda graph then we need to update the persistent output buffer
266+
compiled_engine->output_buffers[pyt_idx] = std::move(outputs[pyt_idx].clone());
267+
}
261268

262-
if (cudagraphs_enabled) {
263-
TORCHTRT_CHECK(
264-
compiled_engine->exec_ctx->setTensorAddress(
265-
name.c_str(), compiled_engine->output_buffers[pyt_idx].data_ptr()),
266-
"Error while setting the output tensor address");
267-
} else {
268-
TORCHTRT_CHECK(
269-
compiled_engine->exec_ctx->setTensorAddress(name.c_str(), outputs[pyt_idx].data_ptr()),
270-
"Error while setting the output tensor address");
269+
if (cudagraphs_enabled) {
270+
TORCHTRT_CHECK(
271+
compiled_engine->exec_ctx->setTensorAddress(
272+
name.c_str(), compiled_engine->output_buffers[pyt_idx].data_ptr()),
273+
"Error while setting the output tensor address");
274+
} else {
275+
TORCHTRT_CHECK(
276+
compiled_engine->exec_ctx->setTensorAddress(name.c_str(), outputs[pyt_idx].data_ptr()),
277+
"Error while setting the output tensor address");
278+
}
271279
}
272280
}
273281
}
274282

275283
auto current_device_id = -1;
276284
if (inputs.size() > 0) {
277285
current_device_id = inputs[0].device().index(); // Done this way to avoid a call to cudart
278-
} else if (outputs.size() > 0) {
279-
current_device_id = outputs[0].device().index(); // Done this way to avoid a call to cudart
280-
}
281-
282-
compiled_engine->caller_stream = c10::cuda::getCurrentCUDAStream(current_device_id);
283-
if (compiled_engine->engine_stream == c10::cuda::getDefaultCUDAStream(current_device_id)) {
284-
// Create a new stream if the engine stream is the default stream
285-
compiled_engine->engine_stream = c10::cuda::getStreamFromPool(false, current_device_id);
286+
if (current_device_id != compiled_engine->current_device_id) {
287+
compiled_engine->stream = c10::cuda::getCurrentCUDAStream(current_device_id);
288+
}
286289
}
287290

288291
{ // Engine Execution (execute on engine stream)
289-
c10::cuda::CUDAStreamGuard stream_guard(compiled_engine->engine_stream);
290292

291293
std::unique_ptr<torch::autograd::profiler::RecordProfile> enqueue_profiler_guard;
292294
if (compiled_engine->profile_execution) {
293295
enqueue_profiler_guard =
294296
std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->enqueue_profile_path);
295297
}
296298

297-
// Block engine stream until results are available on caller stream
298-
at::cuda::CUDAEvent caller_exec_complete;
299-
caller_exec_complete.record(compiled_engine->caller_stream);
300-
caller_exec_complete.block(compiled_engine->engine_stream);
301-
302299
if (!cudagraphs_enabled) {
303300
// Direct execution uses the caller buffers directly
304-
compiled_engine->exec_ctx->enqueueV3(compiled_engine->engine_stream);
301+
compiled_engine->exec_ctx->enqueueV3(compiled_engine->stream);
305302
} else {
306303
if (need_cudagraphs_record) {
307304
// If cudagraphs needs to record a graph, capture the enqueueV3 call in a graph
308-
c10::cuda::CUDAStream recording_stream = compiled_engine->engine_stream;
305+
c10::cuda::CUDAStream recording_stream = compiled_engine->stream;
309306
compiled_engine->cudagraph.capture_begin();
310307
compiled_engine->exec_ctx->enqueueV3(recording_stream);
311308
compiled_engine->cudagraph.capture_end();
@@ -325,11 +322,6 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
325322
compiled_engine->pre_allocated_outputs = create_output_tensors(compiled_engine);
326323
}
327324

328-
// Block caller stream until engine execution is complete
329-
at::cuda::CUDAEvent trt_exec_complete;
330-
trt_exec_complete.record(compiled_engine->engine_stream);
331-
trt_exec_complete.block(compiled_engine->caller_stream);
332-
333325
if (cudagraphs_enabled) {
334326
// If in CUDAGraph mode, results need to be copied to the result buffers (on caller stream)
335327
for (size_t o = 0; o < compiled_engine->output_buffers.size(); o++) {
@@ -354,7 +346,7 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
354346
std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->input_profile_path);
355347
}
356348

357-
setup_input_tensors(inputs, compiled_engine, false, false);
349+
setup_input_tensors(inputs, compiled_engine, false, false, true);
358350
// Check if input shapes can be inferred.
359351
int32_t const io_size{compiled_engine->cuda_engine->getNbIOTensors()};
360352
std::vector<char const*> names(io_size);
@@ -378,40 +370,24 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
378370
auto current_device_id = -1;
379371
if (inputs.size() > 0) {
380372
current_device_id = inputs[0].device().index(); // Done this way to avoid a call to cudart
381-
} else {
382-
current_device_id = at::cuda::current_device();
383-
}
384-
385-
compiled_engine->caller_stream = c10::cuda::getCurrentCUDAStream(current_device_id);
386-
if (compiled_engine->engine_stream == c10::cuda::getDefaultCUDAStream(current_device_id)) {
387-
// Create a new stream if the engine stream is the default stream
388-
compiled_engine->engine_stream = c10::cuda::getStreamFromPool(false, current_device_id);
373+
if (current_device_id != compiled_engine->current_device_id) {
374+
compiled_engine->stream = c10::cuda::getCurrentCUDAStream(current_device_id);
375+
}
389376
}
390377

391378
{ // Engine Execution (execute on engine stream)
392-
c10::cuda::CUDAStreamGuard stream_guard(compiled_engine->engine_stream);
393379

394380
std::unique_ptr<torch::autograd::profiler::RecordProfile> enqueue_profiler_guard;
395381
if (compiled_engine->profile_execution) {
396382
enqueue_profiler_guard =
397383
std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->enqueue_profile_path);
398384
}
399385

400-
// Block engine stream until results are available on caller stream
401-
at::cuda::CUDAEvent caller_exec_complete;
402-
caller_exec_complete.record(compiled_engine->caller_stream);
403-
caller_exec_complete.block(compiled_engine->engine_stream);
404-
405386
// Direct execution uses the caller buffers directly
406-
compiled_engine->exec_ctx->enqueueV3(compiled_engine->engine_stream);
387+
compiled_engine->exec_ctx->enqueueV3(compiled_engine->stream);
407388

408389
} // End engine execution (resets to caller stream)
409390

410-
// Block caller stream until engine execution is complete
411-
at::cuda::CUDAEvent trt_exec_complete;
412-
trt_exec_complete.record(compiled_engine->engine_stream);
413-
trt_exec_complete.block(compiled_engine->caller_stream);
414-
415391
std::unique_ptr<torch::autograd::profiler::RecordProfile> output_profiler_guard;
416392
if (compiled_engine->profile_execution) {
417393
output_profiler_guard =

core/runtime/register_jit_hooks.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,8 @@ static auto TORCHTRT_UNUSED TRTEngineTSRegistrtion =
9090
.def("get_engine_layer_info", &TRTEngine::get_engine_layer_info)
9191
.def("infer_outputs", &TRTEngine::infer_outputs)
9292
.def("reset_captured_graph", &TRTEngine::reset_captured_graph)
93+
.def("set_requires_new_output_tensor", &TRTEngine::set_requires_new_output_tensor)
94+
.def("get_requires_new_output_tensor", &TRTEngine::get_requires_new_output_tensor)
9395
.def_readwrite("use_pre_allocated_outputs", &TRTEngine::use_pre_allocated_outputs)
9496
.def_readwrite("use_output_allocator_outputs", &TRTEngine::use_output_allocator_outputs)
9597
.def_property(

py/torch_tensorrt/dynamo/_compiler.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -995,8 +995,8 @@ def preserve_module_specs(
995995
f.write(trt_module.get_layer_info())
996996

997997
# Only set the requires_unique_output flag for the last TRT Module when user has access to the output tensor
998-
if trt_module and settings.use_python_runtime:
999-
trt_module.set_requires_unique_output(True)
998+
if trt_module:
999+
trt_module.set_requires_new_output_tensor(True)
10001000

10011001
# Parse the graph I/O and store it in dryrun tracker
10021002
parse_graph_io(gm, dryrun_tracker)

py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -221,16 +221,16 @@ def __init__(
221221
self.use_output_allocator_outputs = False
222222
self.device = torch.cuda.current_device()
223223
self.cudagraphs_enabled = torch_tensorrt.runtime.get_cudagraphs_mode()
224-
self.requires_unique_output = False
224+
self.requires_new_output_tensor = False
225225
if self.serialized_engine is not None and not self.settings.lazy_engine_init:
226226
self.setup_engine()
227-
self.is_shape_inference_io = [
228-
self.engine.is_shape_inference_io(input_name)
227+
self.is_shape_inference_io = {
228+
input_name: self.engine.is_shape_inference_io(input_name)
229229
for input_name in self.input_names
230-
]
230+
}
231231

232-
def set_requires_unique_output(self, requires_unique_output: bool) -> None:
233-
self.requires_unique_output = requires_unique_output
232+
def set_requires_new_output_tensor(self, enabled: bool) -> None:
233+
self.requires_new_output_tensor = enabled
234234

235235
def get_streamable_device_memory_budget(self) -> Any:
236236
return self.engine.streamable_weights_size
@@ -405,7 +405,7 @@ def setup_input_tensors(
405405

406406
# For shape tensors, we use CPU pointers and for data tensors, we use GPU pointers
407407
# as per TensorRT requirements
408-
if self.is_shape_inference_io[i]:
408+
if self.is_shape_inference_io[input_name]:
409409
# Shape tensor inputs are cast to int64 explicitly
410410
# Currently Torch CPU pointers are not working; numpy pointers are used instead
411411
# to refer to underlying memory
@@ -520,7 +520,7 @@ def run_standard_execution() -> torch.Tensor | Tuple[torch.Tensor, ...]:
520520
)
521521
if (
522522
self.output_tensors is None
523-
or self.requires_unique_output
523+
or self.requires_new_output_tensor
524524
or shape_changed
525525
):
526526
self.output_tensors = self.create_output_tensors()

py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,11 @@ def _pack_engine_info(self) -> List[str | bytes]:
156156
metadata = {
157157
"settings": self.settings,
158158
"weight_name_map": self.weight_name_map,
159+
"requires_new_output_tensor": (
160+
False
161+
if self.engine is None
162+
else self.engine.get_requires_new_output_tensor()
163+
),
159164
}
160165
target_platform = (
161166
Platform.current_platform()
@@ -284,6 +289,8 @@ def set_extra_state(self, state: SerializedTorchTensorRTModuleFmt) -> None:
284289
metadata = TorchTensorRTModule.decode_metadata(serialized_metadata)
285290
self.settings = metadata["settings"]
286291
self.weight_name_map = metadata["weight_name_map"]
292+
self.requires_new_output_tensor = metadata["requires_new_output_tensor"]
293+
self.engine.set_requires_new_output_tensor(self.requires_new_output_tensor)
287294

288295
else:
289296
self.engine = None
@@ -355,6 +362,12 @@ def enable_profiling(
355362
self.engine.enable_profiling()
356363
self.engine.set_profile_format(profile_format)
357364

365+
def set_requires_new_output_tensor(self, enabled: bool) -> None:
366+
self.engine.set_requires_new_output_tensor(enabled)
367+
368+
def get_requires_new_output_tensor(self) -> bool:
369+
return self.engine.get_requires_new_output_tensor()
370+
358371
def disable_profiling(self) -> None:
359372
"""Disable the profiler"""
360373
if self.engine is None:

setup.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -192,10 +192,10 @@ def build_libtorchtrt_cxx11_abi(
192192
else:
193193
cmd.append("//:libtorchtrt")
194194

195-
if develop:
196-
cmd.append("--compilation_mode=dbg")
197-
else:
198-
cmd.append("--compilation_mode=opt")
195+
# if develop:
196+
# cmd.append("--compilation_mode=dbg")
197+
# else:
198+
cmd.append("--compilation_mode=opt")
199199
if use_dist_dir:
200200
if IS_AARCH64:
201201
cmd.append("--distdir=third_party/dist_dir/aarch64-linux-gnu")

0 commit comments

Comments
 (0)