Commit f261435

feat: Runtime output buffer optimization (#3276)

1 parent: 7767594

13 files changed: +591, -166 lines

core/runtime/TRTEngine.cpp
Lines changed: 3 additions & 0 deletions

@@ -99,6 +99,9 @@ TRTEngine::TRTEngine(
   exec_ctx = make_trt(cuda_engine->createExecutionContext());
   TORCHTRT_CHECK((exec_ctx.get() != nullptr), "Unable to create TensorRT execution context");

+  runtime_states.old_cudagraphs = CUDAGRAPHS_MODE;
+  runtime_states.old_pre_allocated_outputs = false;
+
   if (_in_binding_names.size() == 0 && _out_binding_names.size() == 0) {
     uint64_t inputs = 0;
     uint64_t outputs = 0;
core/runtime/TRTEngine.h
Lines changed: 32 additions & 1 deletion

@@ -30,6 +30,33 @@ using FlattenedState = std::tuple<
     std::tuple<std::string, std::string>, // serialized metadata
     std::tuple<std::string, std::string>>; // Platform

+struct TorchTRTRuntimeStates {
+  // Indicates whether CUDAGraphs were enabled in the previous execute_engine
+  bool old_cudagraphs;
+  // Indicates whether pre-allocated output was enabled in the previous execute_engine
+  bool old_pre_allocated_outputs;
+
+  // Evaluates whether the conditions are met to enable CUDA Graph recording or to reuse
+  // pre-allocated outputs, based on the current and previous states and on whether the
+  // input shape has changed
+  std::tuple<bool, bool> set_runtime_states(bool new_cudagraphs, bool new_pre_allocated_output, bool shape_changed) {
+    bool need_cudagraphs_record = false;
+    bool can_use_pre_allocated_outputs = false;
+
+    // A CUDA Graph record is required if cudagraphs was just switched on, or if the input shape changed
+    if (new_cudagraphs && (!old_cudagraphs || shape_changed)) {
+      need_cudagraphs_record = true;
+    }
+    // Pre-allocated outputs can be reused when the previous and current states are both true
+    // and the shape has not changed
+    if (old_pre_allocated_outputs && new_pre_allocated_output && !shape_changed) {
+      can_use_pre_allocated_outputs = true;
+    }
+    old_cudagraphs = new_cudagraphs;
+    old_pre_allocated_outputs = new_pre_allocated_output;
+
+    return {need_cudagraphs_record, can_use_pre_allocated_outputs};
+  }
+};
+
 struct TRTEngine : torch::CustomClassHolder {
   // Each engine needs its own runtime object
   std::shared_ptr<nvinfer1::IRuntime> rt;
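Read as a transition table, set_runtime_states returns two independent flags per call: whether this pass must record a new CUDA graph, and whether the previous pass's output tensors can be handed out again. Below is a minimal standalone sketch of those transitions; the harness and the pared-down struct are illustrative, not part of the commit (in the engine itself, old_cudagraphs is seeded from CUDAGRAPHS_MODE in the TRTEngine constructor, as shown above).

#include <iostream>
#include <tuple>

// Pared-down copy of TorchTRTRuntimeStates, condensed but logically equivalent
struct RuntimeStates {
  bool old_cudagraphs;
  bool old_pre_allocated_outputs;

  std::tuple<bool, bool> set_runtime_states(bool new_cudagraphs, bool new_pre_allocated_output, bool shape_changed) {
    bool need_record = new_cudagraphs && (!old_cudagraphs || shape_changed);
    bool can_reuse = old_pre_allocated_outputs && new_pre_allocated_output && !shape_changed;
    old_cudagraphs = new_cudagraphs;
    old_pre_allocated_outputs = new_pre_allocated_output;
    return {need_record, can_reuse};
  }
};

int main() {
  RuntimeStates states{false, false};
  auto [rec1, reuse1] = states.set_runtime_states(true, true, false);  // 1, 0: cudagraphs just enabled, no prior outputs
  auto [rec2, reuse2] = states.set_runtime_states(true, true, false);  // 0, 1: steady state, outputs reusable
  auto [rec3, reuse3] = states.set_runtime_states(true, true, true);   // 1, 0: shape change forces a re-record
  std::cout << rec1 << reuse1 << " " << rec2 << reuse2 << " " << rec3 << reuse3 << "\n";  // prints: 10 01 10
}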
@@ -88,6 +115,8 @@ struct TRTEngine : torch::CustomClassHolder {
   int64_t get_streamable_device_memory_budget();
   int64_t get_automatic_device_memory_budget();
   std::vector<at::Tensor> infer_outputs(std::vector<std::vector<int64_t>> input_shapes);
+  void set_pre_allocated_outputs(bool enable);
+  TorchTRTRuntimeStates runtime_states;
   friend std::ostream& operator<<(std::ostream& os, const TRTEngine& engine);
   static const char BINDING_DELIM = '%';

@@ -101,7 +130,9 @@ struct TRTEngine : torch::CustomClassHolder {
   at::cuda::CUDAStream caller_stream = c10::cuda::getDefaultCUDAStream();
   std::vector<at::Tensor> input_buffers = {};
   std::vector<at::Tensor> output_buffers = {};
-  std::string shape_key;
+  std::string shape_key = "None";
+  bool use_pre_allocated_outputs = false;
+  std::vector<at::Tensor> pre_allocated_outputs;

   // TODO: Implement a call method
   // c10::List<at::Tensor> Run(c10::List<at::Tensor> inputs);

core/runtime/execute_engine.cpp
Lines changed: 113 additions & 85 deletions

@@ -60,9 +60,8 @@ RTDevice select_rt_device(const RTDevice& engine_device, const RTDevice& curr_de
   return new_target_device_opt.value();
 }

-bool _cudagraphs_validate_shapes(std::vector<at::Tensor> inputs, c10::intrusive_ptr<TRTEngine> compiled_engine) {
-  // Validate whether the current input shapes to the engine
-  // invalidate the existing cudagraphs object
+bool _validate_shapes(std::vector<at::Tensor> inputs, c10::intrusive_ptr<TRTEngine> compiled_engine) {
+  // Validate whether the current input shapes to the engine have changed

   // Populate the shape key for the inputs
   // x: (3, 4), y: (4, 5) --> Key: (3,4)(4,5)
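As the comment above shows, the shape key is just the concatenation of every input's size tuple. A standalone sketch of the same keying scheme follows; plain std::vector shapes stand in for at::Tensor sizes, so this is an illustration rather than the commit's code.

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Build a shape key like "(3,4)(4,5)" from a list of input shapes,
// mirroring the keying scheme described in the comment above.
std::string make_shape_key(const std::vector<std::vector<int64_t>>& shapes) {
  std::ostringstream key;
  for (const auto& shape : shapes) {
    key << "(";
    for (std::size_t i = 0; i < shape.size(); i++) {
      key << shape[i] << (i + 1 < shape.size() ? "," : "");
    }
    key << ")";
  }
  return key.str();
}

int main() {
  std::string prev = make_shape_key({{3, 4}, {4, 5}});
  std::string curr = make_shape_key({{8, 4}, {4, 5}});
  std::cout << prev << " -> " << curr << (curr != prev ? " (changed)" : "") << "\n";
}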
@@ -83,15 +82,102 @@ bool _cudagraphs_validate_shapes(std::vector<at::Tensor> inputs, c10::intrusive_

   auto new_shape_key = new_shape_key_ss.str();

-  // Compare the shape key to the original key and invalidate shapes if they do not match
+  // Compare the shape key to the original key
   if (new_shape_key != compiled_engine->shape_key) {
-    LOG_DEBUG("Resetting Cudagraph on New Shape Key " << new_shape_key);
+    LOG_DEBUG("Input shape changed " << compiled_engine->shape_key << " -> " << new_shape_key);
     compiled_engine->shape_key = new_shape_key;
-    compiled_engine->cudagraph.reset();
-    return false;
+    return true;
+  }
+
+  return false;
+}
+void setup_input_tensors(
+    std::vector<at::Tensor> inputs,
+    c10::intrusive_ptr<TRTEngine> compiled_engine,
+    bool need_cudagraphs_record) {
+  // this is a buffer to store shape tensor input addresses throughout the runtime scope
+  std::list<std::vector<int64_t>> inputShapeTensorValues;
+  std::list<at::Tensor> formatted_inputs(compiled_engine->num_io.first);
+
+  for (size_t i = 0; i < inputs.size(); i++) {
+    std::string name = compiled_engine->in_binding_names[i];
+
+    TORCHTRT_CHECK(
+        inputs[i].is_cuda(), "Expected input tensors to have device cuda, found device " << inputs[i].device());
+
+    auto expected_type =
+        util::TRTDataTypeToScalarType(compiled_engine->exec_ctx->getEngine().getTensorDataType(name.c_str()));
+    TORCHTRT_CHECK(
+        inputs[i].dtype() == expected_type,
+        "Expected input tensors to have type " << expected_type << ", found type " << inputs[i].dtype());
+
+    auto dims = core::util::toDims(inputs[i].sizes());
+    auto shape = core::util::toVec(dims);
+    LOG_DEBUG("Input Name: " << name << " Shape: " << dims);
+
+    if (compiled_engine->cuda_engine->isShapeInferenceIO(name.c_str())) {
+      // Shape tensor inputs are casted to int64 explicitly.
+      // Refer to
+      // https://github.com/NVIDIA/TensorRT/blob/d2f4ef789a9a6ffdf37b55c3f81b486225f6b380/samples/common/sampleInference.cpp#L435
+      auto input_cpu = inputs[i].clone().contiguous().cpu().to(torch::kInt64);
+      std::vector<int64_t> inputs_cpu_vec(
+          input_cpu.data_ptr<int64_t>(), input_cpu.data_ptr<int64_t>() + input_cpu.numel());
+      inputShapeTensorValues.emplace_back(inputs_cpu_vec);
+      TORCHTRT_CHECK(
+          compiled_engine->exec_ctx->setTensorAddress(name.c_str(), inputShapeTensorValues.back().data()),
+          "Error while setting the tensor address for shape inputs");
+
+      if (CUDAGRAPHS_MODE) {
+        // @peri044 I dont know if this makes sense since they are supposed to be GPU buffers
+        compiled_engine->input_buffers[i] = input_cpu;
+      }
+      TORCHTRT_CHECK(
+          compiled_engine->exec_ctx->setTensorAddress(name.c_str(), inputShapeTensorValues.back().data()),
+          "Error while setting the tensor address for shape inputs");
+
+    } else {
+      at::Tensor contig_input = inputs[i].view(shape).contiguous();
+      formatted_inputs.emplace_back(std::move(contig_input));
+
+      if (need_cudagraphs_record) {
+        // Create a new persistent input buffer
+        compiled_engine->input_buffers[i] = std::move(formatted_inputs.back().clone());
+      }
+
+      TORCHTRT_CHECK(
+          compiled_engine->exec_ctx->setInputShape(name.c_str(), dims), "Error while setting the input shape");
+
+      if (CUDAGRAPHS_MODE) {
+        // If using CUDAGraphs copy formatted input to the corresponding persistent input buffer
+        compiled_engine->input_buffers[i].copy_(formatted_inputs.back(), true);
+        TORCHTRT_CHECK(
+            compiled_engine->exec_ctx->setTensorAddress(name.c_str(), compiled_engine->input_buffers[i].data_ptr()),
+            "Error while setting the input tensor address for inputs");
+      } else {
+        // Otherwise use the formatted buffer directly
+        TORCHTRT_CHECK(
+            compiled_engine->exec_ctx->setTensorAddress(name.c_str(), formatted_inputs.back().data_ptr()),
+            "Error while setting the input tensor address for inputs");
+      }
+    }
+  }
+}
+std::vector<at::Tensor> create_output_tensors(c10::intrusive_ptr<TRTEngine> compiled_engine) {
+  std::vector<at::Tensor> outputs(compiled_engine->num_io.second);
+  for (auto output_indices : compiled_engine->out_binding_map) {
+    // out_binding_map stores TRT_IDX: PYT_IDX
+    auto pyt_idx = output_indices.second;
+
+    std::string name = compiled_engine->out_binding_names[pyt_idx];
+    auto out_shape = compiled_engine->exec_ctx->getTensorShape(name.c_str());
+    LOG_DEBUG("Output Name: " << name << " Shape: " << out_shape);
+
+    auto dims = core::util::toVec(out_shape);
+    auto type = util::TRTDataTypeToScalarType(compiled_engine->exec_ctx->getEngine().getTensorDataType(name.c_str()));
+    outputs[pyt_idx] = std::move(at::empty(dims, {at::kCUDA}).to(type).contiguous());
   }

-  return true;
+  return outputs;
 }

 std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intrusive_ptr<TRTEngine> compiled_engine) {
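One note on create_output_tensors: at::empty(dims, {at::kCUDA}).to(type) allocates a default-dtype tensor and then casts it, producing a second allocation whenever type is not the default. A hedged alternative sketch (assuming libtorch) that passes the dtype up front, purely as a design observation and not part of the commit:

#include <ATen/ATen.h>

// Allocate the output buffer directly in the requested dtype,
// avoiding the allocate-then-cast round trip.
at::Tensor alloc_output(at::IntArrayRef dims, at::ScalarType type) {
  return at::empty(dims, at::TensorOptions().device(at::kCUDA).dtype(type));
}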
@@ -116,18 +202,20 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
     compiled_engine->cudagraph.enable_debug_mode();
   }

+  bool shape_changed = _validate_shapes(inputs, compiled_engine);
+
   // Whether cudagraphs needs to record the graph on this pass
-  bool need_cudagraphs_record = (CUDAGRAPHS_MODE && (!_cudagraphs_validate_shapes(inputs, compiled_engine)));
+  auto result = compiled_engine->runtime_states.set_runtime_states(
+      CUDAGRAPHS_MODE, compiled_engine->use_pre_allocated_outputs, shape_changed);

-  if (!CUDAGRAPHS_MODE) {
+  bool need_cudagraphs_record = std::get<0>(result);
+  bool can_use_pre_allocated_outputs = std::get<1>(result);
+
+  if (!CUDAGRAPHS_MODE || shape_changed) {
     compiled_engine->cudagraph.reset();
   }

-  // this is a buffer to store shape tensor input addresses throughout the runtime scope
-  std::list<std::vector<int64_t>> inputShapeTensorValues;
-
   // Initialize inputs and outputs to be available throughout the succeeding scopes
-  std::list<at::Tensor> formatted_inputs(compiled_engine->num_io.first);
   std::vector<at::Tensor> outputs(compiled_engine->num_io.second);

   if (MULTI_DEVICE_SAFE_MODE) {
@@ -185,68 +273,7 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
         std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->input_profile_path);
   }

-  for (size_t i = 0; i < inputs.size(); i++) {
-    std::string name = compiled_engine->in_binding_names[i];
-    ... (the remaining removed lines are the input-setup loop shown above, moved verbatim into setup_input_tensors) ...
-  }
+  setup_input_tensors(inputs, compiled_engine, need_cudagraphs_record);

   // Check if input shapes can be inferred.
   int32_t const io_size{compiled_engine->cuda_engine->getNbIOTensors()};
@@ -265,19 +292,15 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
     output_profiler_guard =
         std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->output_profile_path);
   }
+  if (can_use_pre_allocated_outputs) {
+    outputs = compiled_engine->pre_allocated_outputs;
+  } else {
+    outputs = create_output_tensors(compiled_engine);
+  }

   for (auto output_indices : compiled_engine->out_binding_map) {
-    // out_binding_map stores TRT_IDX: PYT_IDX
     auto pyt_idx = output_indices.second;
-
     std::string name = compiled_engine->out_binding_names[pyt_idx];
-    auto out_shape = compiled_engine->exec_ctx->getTensorShape(name.c_str());
-    LOG_DEBUG("Output Name: " << name << " Shape: " << out_shape);
-
-    auto dims = core::util::toVec(out_shape);
-    auto type = util::TRTDataTypeToScalarType(compiled_engine->exec_ctx->getEngine().getTensorDataType(name.c_str()));
-    outputs[pyt_idx] = std::move(at::empty(dims, {at::kCUDA}).to(type).contiguous());
-
     if (need_cudagraphs_record) {
       // If we are recording the cuda graph then we need to update the persistent output buffer
       compiled_engine->output_buffers[pyt_idx] = std::move(outputs[pyt_idx].clone());
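The reuse branch above is safe only because set_runtime_states refuses to return can_use_pre_allocated_outputs when the shape key changed: handing out a buffer allocated for a different shape would let setTensorAddress point at a wrongly sized allocation. A generic standalone sketch of that shape-keyed reuse rule (illustrative, not the commit's code):

#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

// A cached buffer is handed out only if it was produced under the same
// shape key; otherwise it is re-allocated for the new shape.
struct ShapeKeyedBuffer {
  std::string key;
  std::vector<float> buffer;

  std::vector<float>& get(const std::string& shape_key, std::size_t numel) {
    if (shape_key != key) {
      buffer.assign(numel, 0.0f);  // shape changed: the old buffer is invalid
      key = shape_key;
    }
    return buffer;
  }
};

int main() {
  ShapeKeyedBuffer cache;
  std::cout << cache.get("(3,4)", 12).size() << " ";   // 12: first allocation
  std::cout << cache.get("(3,4)", 12).size() << " ";   // 12: reused, same shape key
  std::cout << cache.get("(8,4)", 32).size() << "\n";  // 32: re-allocated on shape change
}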
@@ -344,6 +367,11 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
     }
   } // End engine execution (resets to caller stream)

+  // Create output buffer for next execution of graph or trt context.
+  if (compiled_engine->use_pre_allocated_outputs) {
+    compiled_engine->pre_allocated_outputs = create_output_tensors(compiled_engine);
+  }
+
   // Block caller stream until engine execution is complete
   at::cuda::CUDAEvent trt_exec_complete;
   trt_exec_complete.record(compiled_engine->engine_stream);
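Note where the pre-allocation lands: after the engine has been enqueued but before the caller stream blocks on trt_exec_complete, so host-side allocation of the next call's outputs can overlap with device execution. A standalone sketch of that overlap pattern, under the assumption that cudaMemsetAsync stands in for the asynchronously enqueued engine:

#include <cstdio>
#include <vector>
#include <cuda_runtime.h>

int main() {
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  void* dev = nullptr;
  cudaMalloc(&dev, 1 << 20);
  cudaMemsetAsync(dev, 0, 1 << 20, stream);  // stands in for the enqueued engine execution

  std::vector<float> next_outputs(1 << 18);  // stands in for pre-allocating next outputs on the host

  cudaStreamSynchronize(stream);  // block the caller only after the prep work is done
  std::printf("prepared %zu floats while the device worked\n", next_outputs.size());

  cudaFree(dev);
  cudaStreamDestroy(stream);
  return 0;
}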

core/runtime/register_jit_hooks.cpp
Lines changed: 1 addition & 0 deletions

@@ -88,6 +88,7 @@ static auto TORCHTRT_UNUSED TRTEngineTSRegistrtion =
     .def("dump_engine_layer_info", &TRTEngine::dump_engine_layer_info)
     .def("get_engine_layer_info", &TRTEngine::get_engine_layer_info)
     .def("infer_outputs", &TRTEngine::infer_outputs)
+    .def_readwrite("use_pre_allocated_outputs", &TRTEngine::use_pre_allocated_outputs)
     .def_property(
         "device_memory_budget",
         &TRTEngine::get_device_memory_budget,
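For context, def_readwrite exposes a public member variable as a readable and writable attribute on the bound custom class, which is what makes use_pre_allocated_outputs settable from Python/TorchScript. A minimal torchbind sketch with illustrative names, independent of this commit:

#include <torch/custom_class.h>

// A trivial custom class whose member is surfaced as an attribute
struct Toggle : torch::CustomClassHolder {
  bool flag = false;
};

// Registration mirrors the TRTEngineTSRegistrtion pattern above
static auto toggle_registration =
    torch::class_<Toggle>("example_namespace", "Toggle")
        .def(torch::init<>())
        .def_readwrite("flag", &Toggle::flag);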

docsrc/index.rst
Lines changed: 2 additions & 0 deletions

@@ -67,6 +67,7 @@ Tutorials
 * :ref:`custom_kernel_plugins`
 * :ref:`mutable_torchtrt_module_example`
 * :ref:`weight_streaming_example`
+* :ref:`pre_allocated_output_example`

 .. toctree::
    :caption: Tutorials
@@ -85,6 +86,7 @@ Tutorials
    tutorials/_rendered_examples/dynamo/auto_generate_converters
    tutorials/_rendered_examples/dynamo/mutable_torchtrt_module_example
    tutorials/_rendered_examples/dynamo/weight_streaming_example
+   tutorials/_rendered_examples/dynamo/pre_allocated_output_example

 Dynamo Frontend
 ----------------