Tentatively eliminate graph break overhead #3741

Draft · wants to merge 7 commits into base: main
3 changes: 3 additions & 0 deletions core/runtime/TRTEngine.cpp
@@ -237,6 +237,9 @@ TRTEngine::TRTEngine(
out_binding_names[pyt_idx] = binding_name;
}
num_io = std::make_pair(inputs_size, outputs);

this->current_device_id = at::cuda::current_device();
this->stream = c10::cuda::getCurrentCUDAStream(this->current_device_id);
}

#ifndef NDEBUG
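For context on the constructor change above: the engine now caches the active CUDA device index and CUDA stream once at construction, so the execution path can reuse them instead of querying CUDA state on every call. Below is a minimal sketch of that caching pattern; it is not part of this diff, and the CachedStreamHolder struct and maybe_refresh helper are illustrative names only.

// Minimal sketch (not part of this PR): query the CUDA device and stream once,
// then refresh the cached stream only when the caller's device changes.
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAStream.h>

struct CachedStreamHolder {
  int64_t current_device_id = at::cuda::current_device();
  c10::cuda::CUDAStream stream =
      c10::cuda::getCurrentCUDAStream(static_cast<c10::DeviceIndex>(current_device_id));

  // Re-query the stream only on an actual device change, avoiding a
  // getCurrentCUDAStream() call on every engine execution.
  void maybe_refresh(int64_t device_id) {
    if (device_id != current_device_id) {
      current_device_id = device_id;
      stream = c10::cuda::getCurrentCUDAStream(static_cast<c10::DeviceIndex>(device_id));
    }
  }
};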
5 changes: 3 additions & 2 deletions core/runtime/TRTEngine.h
@@ -169,13 +169,14 @@ struct TRTEngine : torch::CustomClassHolder {

// CUDAGraph-Related Functionality
at::cuda::CUDAGraph cudagraph = {};
at::cuda::CUDAStream engine_stream = c10::cuda::getDefaultCUDAStream();
at::cuda::CUDAStream caller_stream = c10::cuda::getDefaultCUDAStream();
at::cuda::CUDAStream stream = c10::cuda::getDefaultCUDAStream();
int64_t current_device_id = at::cuda::current_device();
std::vector<at::Tensor> input_buffers = {};
std::vector<at::Tensor> output_buffers = {};
std::string shape_key = "None";
bool use_pre_allocated_outputs = false;
std::vector<at::Tensor> pre_allocated_outputs;
std::vector<at::Tensor> allocated_outputs;

// Output Allocator-Related Functionality
bool requires_output_allocator = false; // engine requires output allocator
280 changes: 130 additions & 150 deletions core/runtime/execute_engine.cpp
@@ -104,8 +104,8 @@ void setup_input_tensors(
for (size_t i = 0; i < inputs.size(); i++) {
std::string name = compiled_engine->in_binding_names[i];

TORCHTRT_CHECK(
inputs[i].is_cuda(), "Expected input tensors to have device cuda, found device " << inputs[i].device());
// TORCHTRT_CHECK(
// inputs[i].is_cuda(), "Expected input tensors to have device cuda, found device " << inputs[i].device());

auto expected_type =
util::TRTDataTypeToScalarType(compiled_engine->exec_ctx->getEngine().getTensorDataType(name.c_str()));
@@ -202,30 +202,30 @@ void create_output_allocator(c10::intrusive_ptr<TRTEngine> compiled_engine) {

std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intrusive_ptr<TRTEngine> compiled_engine) {
auto run_standard_execution = [&]() {
bool cudagraphs_enabled = (CUDAGRAPHS_MODE == SUBGRAPH_CUDAGRAPHS);
bool shape_changed = _validate_shapes(inputs, compiled_engine);
bool cudagraphs_enabled = false; //(CUDAGRAPHS_MODE == SUBGRAPH_CUDAGRAPHS);
bool shape_changed = false; //_validate_shapes(inputs, compiled_engine);

// Whether cudagraphs needs to record the graph on this pass
auto result = compiled_engine->runtime_states.set_runtime_states(
cudagraphs_enabled, compiled_engine->use_pre_allocated_outputs, shape_changed);

bool need_cudagraphs_record = std::get<0>(result);
bool need_cudagraphs_record = false; //std::get<0>(result);
bool can_use_pre_allocated_outputs = std::get<1>(result);
bool need_cudagraphs_reset = std::get<2>(result);

if (need_cudagraphs_reset) {
compiled_engine->cudagraph.reset();
}
// if (need_cudagraphs_reset) {
// compiled_engine->cudagraph.reset();
// }

std::vector<at::Tensor> outputs(compiled_engine->num_io.second);
std::vector<at::Tensor> outputs;

// Initialize inputs and outputs to be available throughout the succeeding scopes
{ // Input Setup
std::unique_ptr<torch::autograd::profiler::RecordProfile> input_profiler_guard;
if (compiled_engine->profile_execution) {
input_profiler_guard =
std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->input_profile_path);
}
// std::unique_ptr<torch::autograd::profiler::RecordProfile> input_profiler_guard;
// if (compiled_engine->profile_execution) {
// input_profiler_guard =
// std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->input_profile_path);
// }

setup_input_tensors(inputs, compiled_engine, cudagraphs_enabled, need_cudagraphs_record);
// Check if input shapes can be inferred.
@@ -240,72 +240,71 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
}

{ // Output Setup
std::unique_ptr<torch::autograd::profiler::RecordProfile> output_profiler_guard;
if (compiled_engine->profile_execution) {
output_profiler_guard =
std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->output_profile_path);
}
bool new_outputs = false;
// std::unique_ptr<torch::autograd::profiler::RecordProfile> output_profiler_guard;
// if (compiled_engine->profile_execution) {
// output_profiler_guard =
// std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->output_profile_path);
// }
if (can_use_pre_allocated_outputs) {
outputs = compiled_engine->pre_allocated_outputs;
} else {
outputs = create_output_tensors(compiled_engine);
if (compiled_engine->allocated_outputs.size() == 0) {
compiled_engine->allocated_outputs = create_output_tensors(compiled_engine);
std::cout << "new_outputs" << std::endl;
new_outputs = true;
}
outputs = compiled_engine->allocated_outputs;
}

for (auto output_indices : compiled_engine->out_binding_map) {
auto pyt_idx = output_indices.second;
std::string name = compiled_engine->out_binding_names[pyt_idx];
if (need_cudagraphs_record) {
// If we are recording the cuda graph then we need to update the persistent output buffer
compiled_engine->output_buffers[pyt_idx] = std::move(outputs[pyt_idx].clone());
}
if (new_outputs) {
for (auto output_indices : compiled_engine->out_binding_map) {
auto pyt_idx = output_indices.second;
std::string name = compiled_engine->out_binding_names[pyt_idx];
if (need_cudagraphs_record) {
// If we are recording the cuda graph then we need to update the persistent output buffer
compiled_engine->output_buffers[pyt_idx] = std::move(outputs[pyt_idx].clone());
}

if (cudagraphs_enabled) {
TORCHTRT_CHECK(
compiled_engine->exec_ctx->setTensorAddress(
name.c_str(), compiled_engine->output_buffers[pyt_idx].data_ptr()),
"Error while setting the output tensor address");
} else {
TORCHTRT_CHECK(
compiled_engine->exec_ctx->setTensorAddress(name.c_str(), outputs[pyt_idx].data_ptr()),
"Error while setting the output tensor address");
if (cudagraphs_enabled) {
TORCHTRT_CHECK(
compiled_engine->exec_ctx->setTensorAddress(
name.c_str(), compiled_engine->output_buffers[pyt_idx].data_ptr()),
"Error while setting the output tensor address");
} else {
TORCHTRT_CHECK(
compiled_engine->exec_ctx->setTensorAddress(name.c_str(), outputs[pyt_idx].data_ptr()),
"Error while setting the output tensor address");
}
}
}
}

auto current_device_id = -1;
if (inputs.size() > 0) {
current_device_id = inputs[0].device().index(); // Done this way to avoid a call to cudart
} else if (outputs.size() > 0) {
current_device_id = outputs[0].device().index(); // Done this way to avoid a call to cudart
}

compiled_engine->caller_stream = c10::cuda::getCurrentCUDAStream(current_device_id);
if (compiled_engine->engine_stream == c10::cuda::getDefaultCUDAStream(current_device_id)) {
// Create a new stream if the engine stream is the default stream
compiled_engine->engine_stream = c10::cuda::getStreamFromPool(false, current_device_id);
}
// auto current_device_id = -1;
// if (inputs.size() > 0) {
// current_device_id = inputs[0].device().index(); // Done this way to avoid a call to cudart
// if (current_device_id != compiled_engine->current_device_id) {
// compiled_engine->stream = c10::cuda::getCurrentCUDAStream(current_device_id);
// }
// }

{ // Engine Execution (execute on engine stream)
c10::cuda::CUDAStreamGuard stream_guard(compiled_engine->engine_stream);

std::unique_ptr<torch::autograd::profiler::RecordProfile> enqueue_profiler_guard;
if (compiled_engine->profile_execution) {
enqueue_profiler_guard =
std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->enqueue_profile_path);
}
// std::unique_ptr<torch::autograd::profiler::RecordProfile> enqueue_profiler_guard;
// if (compiled_engine->profile_execution) {
// enqueue_profiler_guard =
// std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->enqueue_profile_path);
// }


// Block engine stream until results are available on caller stream
at::cuda::CUDAEvent caller_exec_complete;
caller_exec_complete.record(compiled_engine->caller_stream);
caller_exec_complete.block(compiled_engine->engine_stream);

if (!cudagraphs_enabled) {
// Direct execution uses the caller buffers directly
compiled_engine->exec_ctx->enqueueV3(compiled_engine->engine_stream);
compiled_engine->exec_ctx->enqueueV3(compiled_engine->stream);
} else {
if (need_cudagraphs_record) {
// If cudagraphs needs to record a graph, capture the enqueueV3 call in a graph
c10::cuda::CUDAStream recording_stream = compiled_engine->engine_stream;
c10::cuda::CUDAStream recording_stream = compiled_engine->stream;
compiled_engine->cudagraph.capture_begin();
compiled_engine->exec_ctx->enqueueV3(recording_stream);
compiled_engine->cudagraph.capture_end();
@@ -321,27 +320,22 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
} // End engine execution (resets to caller stream)

// Create output buffer for next execution of graph or trt context.
if (compiled_engine->use_pre_allocated_outputs) {
compiled_engine->pre_allocated_outputs = create_output_tensors(compiled_engine);
}

// Block caller stream until engine execution is complete
at::cuda::CUDAEvent trt_exec_complete;
trt_exec_complete.record(compiled_engine->engine_stream);
trt_exec_complete.block(compiled_engine->caller_stream);

if (cudagraphs_enabled) {
// If in CUDAGraph mode, results need to be copied to the result buffers (on caller stream)
for (size_t o = 0; o < compiled_engine->output_buffers.size(); o++) {
outputs[o].copy_(compiled_engine->output_buffers[o], false);
}
}

if (compiled_engine->profile_execution) {
LOG_INFO(std::endl << *compiled_engine->trt_engine_profiler);
dump_trace(compiled_engine->trt_engine_profile_path, *compiled_engine->trt_engine_profiler);
compiled_engine->dump_engine_layer_info();
}
// if (compiled_engine->use_pre_allocated_outputs) {
// compiled_engine->pre_allocated_outputs = create_output_tensors(compiled_engine);
// }

// if (cudagraphs_enabled) {
// // If in CUDAGraph mode, results need to be copied to the result buffers (on caller stream)
// for (size_t o = 0; o < compiled_engine->output_buffers.size(); o++) {
// outputs[o].copy_(compiled_engine->output_buffers[o], false);
// }
// }

// if (compiled_engine->profile_execution) {
// LOG_INFO(std::endl << *compiled_engine->trt_engine_profiler);
// dump_trace(compiled_engine->trt_engine_profile_path, *compiled_engine->trt_engine_profiler);
// compiled_engine->dump_engine_layer_info();
// }

return outputs;
};
@@ -378,45 +372,31 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
auto current_device_id = -1;
if (inputs.size() > 0) {
current_device_id = inputs[0].device().index(); // Done this way to avoid a call to cudart
} else {
current_device_id = at::cuda::current_device();
}
if (current_device_id != compiled_engine->current_device_id) {
compiled_engine->stream = c10::cuda::getCurrentCUDAStream(current_device_id);

}
}

compiled_engine->caller_stream = c10::cuda::getCurrentCUDAStream(current_device_id);
if (compiled_engine->engine_stream == c10::cuda::getDefaultCUDAStream(current_device_id)) {
// Create a new stream if the engine stream is the default stream
compiled_engine->engine_stream = c10::cuda::getStreamFromPool(false, current_device_id);
}

{ // Engine Execution (execute on engine stream)
c10::cuda::CUDAStreamGuard stream_guard(compiled_engine->engine_stream);

std::unique_ptr<torch::autograd::profiler::RecordProfile> enqueue_profiler_guard;
if (compiled_engine->profile_execution) {
enqueue_profiler_guard =
std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->enqueue_profile_path);
}

// Block engine stream until results are available on caller stream
at::cuda::CUDAEvent caller_exec_complete;
caller_exec_complete.record(compiled_engine->caller_stream);
caller_exec_complete.block(compiled_engine->engine_stream);
// std::unique_ptr<torch::autograd::profiler::RecordProfile> enqueue_profiler_guard;
// if (compiled_engine->profile_execution) {
// enqueue_profiler_guard =
// std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->enqueue_profile_path);
// }

// Direct execution uses the caller buffers directly
compiled_engine->exec_ctx->enqueueV3(compiled_engine->engine_stream);
compiled_engine->exec_ctx->enqueueV3(compiled_engine->stream);

} // End engine execution (resets to caller stream)

// Block caller stream until engine execution is complete
at::cuda::CUDAEvent trt_exec_complete;
trt_exec_complete.record(compiled_engine->engine_stream);
trt_exec_complete.block(compiled_engine->caller_stream);

std::unique_ptr<torch::autograd::profiler::RecordProfile> output_profiler_guard;
if (compiled_engine->profile_execution) {
output_profiler_guard =
std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->output_profile_path);
}
// std::unique_ptr<torch::autograd::profiler::RecordProfile> output_profiler_guard;
// if (compiled_engine->profile_execution) {
// output_profiler_guard =
// std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->output_profile_path);
// }
std::vector<at::Tensor> outputs;
for (size_t i = 0; i < compiled_engine->out_binding_names.size(); i++) {
auto name = compiled_engine->out_binding_names[i];
@@ -476,45 +456,45 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->device_profile_path);
}

RTDevice curr_device = get_current_device();
LOG_DEBUG("Current Device: " << curr_device);

// Generic Target Device Prefix
std::string target_device = "cuda:";

if (is_switch_required(curr_device, compiled_engine->device_info)) {
// Scan through available CUDA devices and set the CUDA device context correctly
RTDevice device =
select_rt_device(compiled_engine->device_info, curr_device, compiled_engine->hardware_compatible);
set_rt_device(device);

// Target device is new device
target_device += std::to_string(device.id);

for (auto& in : inputs) {
in = in.to(torch::Device(target_device));
}
} else {
// Target device is current device
target_device += std::to_string(curr_device.id);
}

// For each input, ensure its current device is the desired target device
for (size_t i = 0; i < inputs.size(); i++) {
at::Tensor* in = &inputs[i];
std::string current_tensor_device = in->device().str();

// If current device string does not match target device, display warning and move tensor accordingly
if (current_tensor_device != target_device) {
LOG_WARNING(
"Input " << i << " of engine " << compiled_engine->name << " was found to be on " << current_tensor_device
<< " but should be on " << target_device << ". This tensor is being moved by the runtime but "
<< "for performance considerations, ensure your inputs are all on GPU "
<< "and open an issue here (https://github.com/pytorch/TensorRT/issues) if this "
<< "warning persists.");
*in = in->to(torch::Device(target_device));
}
}
// RTDevice curr_device = get_current_device();
// LOG_DEBUG("Current Device: " << curr_device);

// // Generic Target Device Prefix
// std::string target_device = "cuda:";

// if (is_switch_required(curr_device, compiled_engine->device_info)) {
// // Scan through available CUDA devices and set the CUDA device context correctly
// RTDevice device =
// select_rt_device(compiled_engine->device_info, curr_device, compiled_engine->hardware_compatible);
// set_rt_device(device);

// // Target device is new device
// target_device += std::to_string(device.id);

// for (auto& in : inputs) {
// in = in.to(torch::Device(target_device));
// }
// } else {
// // Target device is current device
// target_device += std::to_string(curr_device.id);
// }

// // For each input, ensure its current device is the desired target device
// for (size_t i = 0; i < inputs.size(); i++) {
// at::Tensor* in = &inputs[i];
// std::string current_tensor_device = in->device().str();

// // If current device string does not match target device, display warning and move tensor accordingly
// if (current_tensor_device != target_device) {
// LOG_WARNING(
// "Input " << i << " of engine " << compiled_engine->name << " was found to be on " << current_tensor_device
// << " but should be on " << target_device << ". This tensor is being moved by the runtime but "
// << "for performance considerations, ensure your inputs are all on GPU "
// << "and open an issue here (https://github.com/pytorch/TensorRT/issues) if this "
// << "warning persists.");
// *in = in->to(torch::Device(target_device));
// }
// }
}

if (compiled_engine->requires_output_allocator) { // engine requires OA
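For readers skimming the execute_engine.cpp diff, the hot-path change can be summarized as: allocate the output tensors once, bind their addresses to the TensorRT execution context once, and afterwards only enqueue on the stream cached at engine construction (the caller/engine stream events, shape validation, and per-call profiling guards are commented out in this draft). The following condensed sketch is not the PR's exact code; it reuses the helpers that appear in the diff with simplified control flow, and the run_cached name is illustrative only.

// Condensed sketch (not the PR's exact code) of the reuse pattern:
// allocate outputs and bind their TensorRT addresses only on the first call,
// then enqueue on the stream cached at engine construction.
std::vector<at::Tensor> run_cached(
    c10::intrusive_ptr<TRTEngine> compiled_engine,
    std::vector<at::Tensor>& inputs) {
  setup_input_tensors(inputs, compiled_engine, /*cudagraphs_enabled=*/false, /*need_cudagraphs_record=*/false);

  if (compiled_engine->allocated_outputs.empty()) {
    // First call: create output tensors and bind their addresses once.
    compiled_engine->allocated_outputs = create_output_tensors(compiled_engine);
    for (const auto& output_indices : compiled_engine->out_binding_map) {
      auto pyt_idx = output_indices.second;
      const std::string& name = compiled_engine->out_binding_names[pyt_idx];
      TORCHTRT_CHECK(
          compiled_engine->exec_ctx->setTensorAddress(
              name.c_str(), compiled_engine->allocated_outputs[pyt_idx].data_ptr()),
          "Error while setting the output tensor address");
    }
  }

  // Enqueue directly on the cached stream; the caller/engine stream events and
  // profiling guards from the original path are intentionally skipped here.
  compiled_engine->exec_ctx->enqueueV3(compiled_engine->stream);
  return compiled_engine->allocated_outputs;
}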