@@ -60,9 +60,8 @@ RTDevice select_rt_device(const RTDevice& engine_device, const RTDevice& curr_de
  return new_target_device_opt.value();
}

- bool _cudagraphs_validate_shapes(std::vector<at::Tensor> inputs, c10::intrusive_ptr<TRTEngine> compiled_engine) {
-   // Validate whether the current input shapes to the engine
-   // invalidate the existing cudagraphs object
+ bool _validate_shapes(std::vector<at::Tensor> inputs, c10::intrusive_ptr<TRTEngine> compiled_engine) {
+   // Validate whether the current input shapes to the engine have changed

  // Populate the shape key for the inputs
  // x: (3, 4), y: (4, 5) --> Key: (3,4)(4,5)
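
For context: the key compared in this function is just the concatenated dimensions of every input, so `x: (3, 4), y: (4, 5)` yields `(3,4)(4,5)`. Below is a minimal standalone sketch of that construction, assuming nothing beyond the standard library (the real body builds the key from the engine's bindings and lies mostly outside this hunk):

    #include <cstdint>
    #include <sstream>
    #include <string>
    #include <vector>

    // Build a key such as "(3,4)(4,5)" from a list of input shapes.
    std::string make_shape_key(const std::vector<std::vector<int64_t>>& shapes) {
      std::ostringstream key;
      for (const auto& shape : shapes) {
        key << "(";
        for (size_t d = 0; d < shape.size(); d++) {
          key << shape[d] << (d + 1 < shape.size() ? "," : "");
        }
        key << ")";
      }
      return key.str(); // mirrors new_shape_key_ss.str() in the diff below
    }
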
@@ -83,15 +82,102 @@ bool _cudagraphs_validate_shapes(std::vector<at::Tensor> inputs, c10::intrusive_

  auto new_shape_key = new_shape_key_ss.str();

- // Compare the shape key to the original key and invalidate shapes if they do not match
+ // Compare the shape key to the original key
  if (new_shape_key != compiled_engine->shape_key) {
-   LOG_DEBUG("Resetting Cudagraph on New Shape Key " << new_shape_key);
+   LOG_DEBUG("Input shape changed " << compiled_engine->shape_key << " -> " << new_shape_key);
    compiled_engine->shape_key = new_shape_key;
-   compiled_engine->cudagraph.reset();
-   return false;
+   return true;
+ }
+
+ return false;
+ }
+ void setup_input_tensors(
+     std::vector<at::Tensor> inputs,
+     c10::intrusive_ptr<TRTEngine> compiled_engine,
+     bool need_cudagraphs_record) {
+   // this is a buffer to store shape tensor input addresses throughout the runtime scope
+   std::list<std::vector<int64_t>> inputShapeTensorValues;
+   std::list<at::Tensor> formatted_inputs(compiled_engine->num_io.first);
+
+   for (size_t i = 0; i < inputs.size(); i++) {
+     std::string name = compiled_engine->in_binding_names[i];
+
+     TORCHTRT_CHECK(
+         inputs[i].is_cuda(), "Expected input tensors to have device cuda, found device " << inputs[i].device());
+
+     auto expected_type =
+         util::TRTDataTypeToScalarType(compiled_engine->exec_ctx->getEngine().getTensorDataType(name.c_str()));
+     TORCHTRT_CHECK(
+         inputs[i].dtype() == expected_type,
+         "Expected input tensors to have type " << expected_type << ", found type " << inputs[i].dtype());
+
+     auto dims = core::util::toDims(inputs[i].sizes());
+     auto shape = core::util::toVec(dims);
+     LOG_DEBUG("Input Name: " << name << " Shape: " << dims);
+
+     if (compiled_engine->cuda_engine->isShapeInferenceIO(name.c_str())) {
+       // Shape tensor inputs are cast to int64 explicitly.
+       // Refer to
+       // https://github.com/NVIDIA/TensorRT/blob/d2f4ef789a9a6ffdf37b55c3f81b486225f6b380/samples/common/sampleInference.cpp#L435
+       auto input_cpu = inputs[i].clone().contiguous().cpu().to(torch::kInt64);
+       std::vector<int64_t> inputs_cpu_vec(
+           input_cpu.data_ptr<int64_t>(), input_cpu.data_ptr<int64_t>() + input_cpu.numel());
+       inputShapeTensorValues.emplace_back(inputs_cpu_vec);
+       TORCHTRT_CHECK(
+           compiled_engine->exec_ctx->setTensorAddress(name.c_str(), inputShapeTensorValues.back().data()),
+           "Error while setting the tensor address for shape inputs");
+
+       if (CUDAGRAPHS_MODE) {
+         // @peri044 I don't know if this makes sense since they are supposed to be GPU buffers
+         compiled_engine->input_buffers[i] = input_cpu;
+       }
+       TORCHTRT_CHECK(
+           compiled_engine->exec_ctx->setTensorAddress(name.c_str(), inputShapeTensorValues.back().data()),
+           "Error while setting the tensor address for shape inputs");
+
+     } else {
+       at::Tensor contig_input = inputs[i].view(shape).contiguous();
+       formatted_inputs.emplace_back(std::move(contig_input));
+
+       if (need_cudagraphs_record) {
+         // Create a new persistent input buffer
+         compiled_engine->input_buffers[i] = std::move(formatted_inputs.back().clone());
+       }
+
+       TORCHTRT_CHECK(
+           compiled_engine->exec_ctx->setInputShape(name.c_str(), dims), "Error while setting the input shape");
+
+       if (CUDAGRAPHS_MODE) {
+         // If using CUDAGraphs copy formatted input to the corresponding persistent input buffer
+         compiled_engine->input_buffers[i].copy_(formatted_inputs.back(), true);
+         TORCHTRT_CHECK(
+             compiled_engine->exec_ctx->setTensorAddress(name.c_str(), compiled_engine->input_buffers[i].data_ptr()),
+             "Error while setting the input tensor address for inputs");
+       } else {
+         // Otherwise use the formatted buffer directly
+         TORCHTRT_CHECK(
+             compiled_engine->exec_ctx->setTensorAddress(name.c_str(), formatted_inputs.back().data_ptr()),
+             "Error while setting the input tensor address for inputs");
+       }
+     }
+   }
+ }
+ std::vector<at::Tensor> create_output_tensors(c10::intrusive_ptr<TRTEngine> compiled_engine) {
+   std::vector<at::Tensor> outputs(compiled_engine->num_io.second);
+   for (auto output_indices : compiled_engine->out_binding_map) {
+     // out_binding_map stores TRT_IDX: PYT_IDX
+     auto pyt_idx = output_indices.second;
+
+     std::string name = compiled_engine->out_binding_names[pyt_idx];
+     auto out_shape = compiled_engine->exec_ctx->getTensorShape(name.c_str());
+     LOG_DEBUG("Output Name: " << name << " Shape: " << out_shape);
+
+     auto dims = core::util::toVec(out_shape);
+     auto type = util::TRTDataTypeToScalarType(compiled_engine->exec_ctx->getEngine().getTensorDataType(name.c_str()));
+     outputs[pyt_idx] = std::move(at::empty(dims, {at::kCUDA}).to(type).contiguous());
  }

- return true;
+ return outputs;
}

std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intrusive_ptr<TRTEngine> compiled_engine) {
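
One design choice in `setup_input_tensors` worth calling out: `inputShapeTensorValues` is a `std::list`, not a `std::vector`, because the raw `.data()` pointers handed to `setTensorAddress()` must stay valid for the whole runtime scope, and `std::list::emplace_back` never relocates existing elements. A small self-contained illustration of that property:

    #include <cassert>
    #include <cstdint>
    #include <list>
    #include <vector>

    int main() {
      std::list<std::vector<int64_t>> values; // as in inputShapeTensorValues
      values.emplace_back(std::vector<int64_t>{3, 4});
      const int64_t* addr = values.back().data(); // pointer given to the engine
      for (int64_t i = 0; i < 1000; i++) {
        values.emplace_back(std::vector<int64_t>{i}); // later shape inputs
      }
      // A std::vector of vectors could have reallocated and moved the first
      // element by now; a std::list guarantees it has not.
      assert(addr == values.front().data());
      return 0;
    }
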
@@ -116,18 +202,20 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
    compiled_engine->cudagraph.enable_debug_mode();
  }

+ bool shape_changed = _validate_shapes(inputs, compiled_engine);
+
  // Whether cudagraphs needs to record the graph on this pass
- bool need_cudagraphs_record = (CUDAGRAPHS_MODE && (!_cudagraphs_validate_shapes(inputs, compiled_engine)));
+ auto result = compiled_engine->runtime_states.set_runtime_states(
+     CUDAGRAPHS_MODE, compiled_engine->use_pre_allocated_outputs, shape_changed);

- if (!CUDAGRAPHS_MODE) {
+ bool need_cudagraphs_record = std::get<0>(result);
+ bool can_use_pre_allocated_outputs = std::get<1>(result);
+
+ if (!CUDAGRAPHS_MODE || shape_changed) {
    compiled_engine->cudagraph.reset();
  }

- // this is a buffer to store shape tensor input addresses throughout the runtime scope
- std::list<std::vector<int64_t>> inputShapeTensorValues;
-
  // Initialize inputs and outputs to be available throughout the succeeding scopes
- std::list<at::Tensor> formatted_inputs(compiled_engine->num_io.first);
  std::vector<at::Tensor> outputs(compiled_engine->num_io.second);

  if (MULTI_DEVICE_SAFE_MODE) {
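
The tuple returned by `set_runtime_states` drives the two decisions above. Its implementation is not part of this diff, so the following is only a plausible sketch of the transition logic, with invented member names inferred from the call site; the real runtime-states helper may differ:

    #include <tuple>

    // Hypothetical sketch only: names and rules are assumptions, not the
    // actual runtime-states implementation.
    struct RuntimeStatesSketch {
      bool prev_cudagraphs = false;

      // Returns {need_cudagraphs_record, can_use_pre_allocated_outputs}.
      std::tuple<bool, bool> set_runtime_states(bool cudagraphs, bool pre_allocated_outputs, bool shape_changed) {
        // Re-record the CUDA graph when graphs are active and either the mode
        // was just enabled or the input shapes changed (captured graph is stale).
        bool need_record = cudagraphs && (!prev_cudagraphs || shape_changed);
        // Pre-allocated outputs are only safe to reuse when shapes are unchanged.
        bool use_pre_alloc = pre_allocated_outputs && !shape_changed;
        prev_cudagraphs = cudagraphs;
        return {need_record, use_pre_alloc};
      }
    };
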
@@ -185,68 +273,7 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
        std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->input_profile_path);
  }

- for (size_t i = 0; i < inputs.size(); i++) {
-   std::string name = compiled_engine->in_binding_names[i];
-
-   TORCHTRT_CHECK(
-       inputs[i].is_cuda(), "Expected input tensors to have device cuda, found device " << inputs[i].device());
-
-   auto expected_type =
-       util::TRTDataTypeToScalarType(compiled_engine->exec_ctx->getEngine().getTensorDataType(name.c_str()));
-   TORCHTRT_CHECK(
-       inputs[i].dtype() == expected_type,
-       "Expected input tensors to have type " << expected_type << ", found type " << inputs[i].dtype());
-
-   auto dims = core::util::toDims(inputs[i].sizes());
-   auto shape = core::util::toVec(dims);
-   LOG_DEBUG("Input Name: " << name << " Shape: " << dims);
-
-   if (compiled_engine->cuda_engine->isShapeInferenceIO(name.c_str())) {
-     // Shape tensor inputs are cast to int64 explicitly.
-     // Refer to
-     // https://github.com/NVIDIA/TensorRT/blob/d2f4ef789a9a6ffdf37b55c3f81b486225f6b380/samples/common/sampleInference.cpp#L435
-     auto input_cpu = inputs[i].clone().contiguous().cpu().to(torch::kInt64);
-     std::vector<int64_t> inputs_cpu_vec(
-         input_cpu.data_ptr<int64_t>(), input_cpu.data_ptr<int64_t>() + input_cpu.numel());
-     inputShapeTensorValues.emplace_back(inputs_cpu_vec);
-     TORCHTRT_CHECK(
-         compiled_engine->exec_ctx->setTensorAddress(name.c_str(), inputShapeTensorValues.back().data()),
-         "Error while setting the tensor address for shape inputs");
-
-     if (CUDAGRAPHS_MODE) {
-       // @peri044 I don't know if this makes sense since they are supposed to be GPU buffers
-       compiled_engine->input_buffers[i] = input_cpu;
-     }
-     TORCHTRT_CHECK(
-         compiled_engine->exec_ctx->setTensorAddress(name.c_str(), inputShapeTensorValues.back().data()),
-         "Error while setting the tensor address for shape inputs");
-
-   } else {
-     at::Tensor contig_input = inputs[i].view(shape).contiguous();
-     formatted_inputs.emplace_back(std::move(contig_input));
-
-     if (need_cudagraphs_record) {
-       // Create a new persistent input buffer
-       compiled_engine->input_buffers[i] = std::move(formatted_inputs.back().clone());
-     }
-
-     TORCHTRT_CHECK(
-         compiled_engine->exec_ctx->setInputShape(name.c_str(), dims), "Error while setting the input shape");
-
-     if (CUDAGRAPHS_MODE) {
-       // If using CUDAGraphs copy formatted input to the corresponding persistent input buffer
-       compiled_engine->input_buffers[i].copy_(formatted_inputs.back(), true);
-       TORCHTRT_CHECK(
-           compiled_engine->exec_ctx->setTensorAddress(name.c_str(), compiled_engine->input_buffers[i].data_ptr()),
-           "Error while setting the input tensor address for inputs");
-     } else {
-       // Otherwise use the formatted buffer directly
-       TORCHTRT_CHECK(
-           compiled_engine->exec_ctx->setTensorAddress(name.c_str(), formatted_inputs.back().data_ptr()),
-           "Error while setting the input tensor address for inputs");
-     }
-   }
- }
+ setup_input_tensors(inputs, compiled_engine, need_cudagraphs_record);

  // Check if input shapes can be inferred.
  int32_t const io_size{compiled_engine->cuda_engine->getNbIOTensors()};
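
The persistent `input_buffers` that the deleted loop (now `setup_input_tensors`) copies into exist because a captured CUDA graph replays with the device addresses it saw at record time; later calls must copy fresh data into those fixed buffers rather than bind new tensors. A minimal illustration of the copy pattern, assuming a CUDA-enabled libtorch build:

    #include <ATen/ATen.h>

    int main() {
      // Persistent buffer: its address is what a recorded CUDA graph replays with.
      at::Tensor persistent = at::empty({3, 4}, at::kCUDA);
      // Fresh user input arriving on a later call.
      at::Tensor fresh = at::rand({3, 4}, at::kCUDA);
      // Copy into the fixed-address buffer, mirroring
      // input_buffers[i].copy_(formatted_inputs.back(), true) above.
      persistent.copy_(fresh, /*non_blocking=*/true);
      return 0;
    }
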
@@ -265,19 +292,15 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
      output_profiler_guard =
          std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->output_profile_path);
  }
+ if (can_use_pre_allocated_outputs) {
+   outputs = compiled_engine->pre_allocated_outputs;
+ } else {
+   outputs = create_output_tensors(compiled_engine);
+ }

  for (auto output_indices : compiled_engine->out_binding_map) {
-   // out_binding_map stores TRT_IDX: PYT_IDX
    auto pyt_idx = output_indices.second;
-
    std::string name = compiled_engine->out_binding_names[pyt_idx];
-   auto out_shape = compiled_engine->exec_ctx->getTensorShape(name.c_str());
-   LOG_DEBUG("Output Name: " << name << " Shape: " << out_shape);
-
-   auto dims = core::util::toVec(out_shape);
-   auto type = util::TRTDataTypeToScalarType(compiled_engine->exec_ctx->getEngine().getTensorDataType(name.c_str()));
-   outputs[pyt_idx] = std::move(at::empty(dims, {at::kCUDA}).to(type).contiguous());
-
    if (need_cudagraphs_record) {
      // If we are recording the cuda graph then we need to update the persistent output buffer
      compiled_engine->output_buffers[pyt_idx] = std::move(outputs[pyt_idx].clone());
@@ -344,6 +367,11 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
  }
} // End engine execution (resets to caller stream)

+ // Create output buffer for next execution of graph or trt context.
+ if (compiled_engine->use_pre_allocated_outputs) {
+   compiled_engine->pre_allocated_outputs = create_output_tensors(compiled_engine);
+ }
+
  // Block caller stream until engine execution is complete
  at::cuda::CUDAEvent trt_exec_complete;
  trt_exec_complete.record(compiled_engine->engine_stream);
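
Taken together with the earlier hunk, the pre-allocated output path forms a simple one-step pipeline: each call consumes the outputs allocated at the end of the previous call and, after enqueueing, allocates the set for the next call, keeping output allocation off the path between input setup and engine execution. A generic sketch of that pattern; `Buffer`, `allocate()` and `run()` are stand-ins for `at::Tensor`, `create_output_tensors()` and engine execution, not the actual `execute_engine` body:

    #include <vector>

    struct Buffer { int size = 0; };

    std::vector<Buffer> allocate() { return {Buffer{1}, Buffer{2}}; } // stand-in for create_output_tensors()
    void run(const std::vector<Buffer>&) {}                           // stand-in for engine execution

    int main() {
      std::vector<Buffer> pre_allocated = allocate(); // done once up front
      for (int call = 0; call < 3; call++) {
        std::vector<Buffer> outputs = pre_allocated;  // reuse, no allocation on the hot path
        run(outputs);                                 // engine execution
        pre_allocated = allocate();                   // prepare buffers for the next call
      }
      return 0;
    }
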