From 642f74e4e22de5430b1be9dcdf6917b10acd45bf Mon Sep 17 00:00:00 2001 From: J Wyman Date: Mon, 27 Oct 2025 16:30:56 -0400 Subject: [PATCH 1/2] maintenance: Separate Code into Separate Files This change breaks the monolithic src/libtorch.cc into multiple files, with a modern separation of classes into separate header and code files. --- CMakeLists.txt | 3 + src/libtorch.cc | 2493 +---------------------------------- src/libtorch.hh | 59 + src/model_instance_state.cc | 1632 +++++++++++++++++++++++ src/model_instance_state.hh | 178 +++ src/model_state.cc | 495 +++++++ src/model_state.hh | 131 ++ src/naming_convention.hh | 40 + src/string_utilities.cc | 254 ++++ src/string_utilities.hh | 106 ++ 10 files changed, 2902 insertions(+), 2489 deletions(-) create mode 100644 src/libtorch.hh create mode 100644 src/model_instance_state.cc create mode 100644 src/model_instance_state.hh create mode 100644 src/model_state.cc create mode 100644 src/model_state.hh create mode 100644 src/naming_convention.hh create mode 100644 src/string_utilities.cc create mode 100644 src/string_utilities.hh diff --git a/CMakeLists.txt b/CMakeLists.txt index 3afe90b..227aa27 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -280,6 +280,9 @@ add_library( src/libtorch.cc src/libtorch_utils.cc src/libtorch_utils.h + src/model_instance_state.cc + src/model_state.cc + src/string_utilities.cc ) add_library( diff --git a/src/libtorch.cc b/src/libtorch.cc index c873375..500f1f5 100644 --- a/src/libtorch.cc +++ b/src/libtorch.cc @@ -24,2498 +24,13 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -#include - -#include -#include -#include - -#include "libtorch_utils.h" -#include "triton/backend/backend_common.h" -#include "triton/backend/backend_input_collector.h" -#include "triton/backend/backend_memory.h" -#include "triton/backend/backend_model.h" -#include "triton/backend/backend_model_instance.h" -#include "triton/backend/backend_output_responder.h" -#include "triton/common/nvtx.h" -#include "triton/core/tritonbackend.h" - -#ifdef TRITON_PYTORCH_ENABLE_TORCHVISION -// Suppress warnings in torch headers -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wsign-compare" -#pragma warning(push, 0) -#include -#include // Torchvision header -#pragma warning(pop) -#pragma GCC diagnostic pop -#endif // TRITON_PYTORCH_ENABLE_TORCHVISION - -#ifdef TRITON_ENABLE_GPU -#include -#include -#include -#endif // TRITON_ENABLE_GPU - -// for thread control -// https://pytorch.org/docs/stable/notes/cpu_threading_torchscript_inference.html#runtime-api -// https://github.com/pytorch/pytorch/blob/v2.2.1-rc3/aten/src/ATen/Parallel.h#L133 -#include - +#include "libtorch.hh" // // PyTorch C++ (LibTorch) Backend that implements the TRITONBACKEND API. // -namespace { -std::once_flag pytorch_interop_threads_flag; -std::once_flag pytorch_intraop_threads_flag; -} // namespace - -namespace triton { namespace backend { namespace pytorch { - -// -// ModelState -// -// State associated with a model that is using this backend. An object -// of this class is created and associated with each -// TRITONBACKEND_Model. -// -class ModelState : public BackendModel { - public: - static TRITONSERVER_Error* Create( - TRITONBACKEND_Model* triton_model, ModelState** state); - virtual ~ModelState() = default; - - // Load a TorchScript model using 'artifact_name' as the name for the - // TorchScript file. 
Return in 'model_path' the full path to the - // TorchScript file, return in 'torch_model' the Torch Module - // representing the model. - TRITONSERVER_Error* LoadModel( - const std::string& artifact_name, const torch::Device device, - std::string* model_path, const TRITONSERVER_InstanceGroupKind& kind, - std::shared_ptr* torch_model); - - bool EnabledOptimizedExecution() { return enable_optimized_execution_; } - const std::pair& EnabledTensorExprFuser() const - { - return enable_tensor_fuser_pair_; - } - const std::pair& EnabledJitProfiling() const - { - return enable_jit_profiling_pair_; - } - const std::pair& EnabledJitExecutor() const - { - return enable_jit_executor_pair_; - } - bool EnabledInferenceMode() { return enable_inference_mode_; } - bool EnabledCudnn() { return enable_cudnn_; } - bool EnabledCacheCleaning() { return enable_cache_cleaning_; } - - bool EnabledWeightSharing() { return enable_weight_sharing_; } - const std::map>& ModelOutputs() - { - return model_outputs_; - } - - private: - ModelState(TRITONBACKEND_Model* triton_model); - TRITONSERVER_Error* AutoCompleteConfig(); - - // Parses and validates parameters in config - TRITONSERVER_Error* ParseParameters(); - - // Flag to indicate whether optimized execution is enabled. Defaults to true. - bool enable_optimized_execution_; - - // Flag to indicate whether inference mode is enabled. Defaults to false. - bool enable_inference_mode_; - - // Flag to indicate whether cudnn is enabled. Defaults to true. - bool enable_cudnn_; - - // Flag to indicate whether cache cleaning after each run is enabled. - // Defaults to false. - bool enable_cache_cleaning_; - - // Flag to indicate whether weight sharing is enabled. Defaults to false. - bool enable_weight_sharing_; - - // Flag pairs to indicate if various JIT settings are set and - // enabled respectively. Defaults to (false, true). Default behavior - // is to do nothing if not explicitly set. - std::pair enable_tensor_fuser_pair_; - std::pair enable_jit_profiling_pair_; - std::pair enable_jit_executor_pair_; - - // Model mapping for shared TorchScript model across all instances on the - // same device. The key is a pair of isGPU and device index. - std::map< - std::pair, std::shared_ptr> - torch_models_; - - // model_outputs is a map that contains unique outputs that the model must - // provide. The first pair is the model output index and the second is - // the index in the model state, -1 is used if one is not required. - // In the model configuration, the output in the state configuration - // can have intersection with the outputs section of the model. If an output - // is specified both in the output section and state section, it indicates - // that the backend must return the output state to the client too. - std::map> model_outputs_; -}; - -TRITONSERVER_Error* -ModelState::Create(TRITONBACKEND_Model* triton_model, ModelState** state) -{ - try { - *state = new ModelState(triton_model); - } - catch (const BackendModelException& ex) { - RETURN_ERROR_IF_TRUE( - ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL, - std::string("unexpected nullptr in BackendModelException")); - RETURN_IF_ERROR(ex.err_); - } - - // Auto-complete the configuration if requested... 
- bool auto_complete_config = false; - RETURN_IF_ERROR(TRITONBACKEND_ModelAutoCompleteConfig( - triton_model, &auto_complete_config)); - if (auto_complete_config) { - RETURN_IF_ERROR((*state)->AutoCompleteConfig()); - RETURN_IF_ERROR((*state)->SetModelConfig()); - } - - auto& model_outputs = (*state)->model_outputs_; - // Parse the output states in the model configuration - triton::common::TritonJson::Value sequence_batching; - if ((*state)->ModelConfig().Find("sequence_batching", &sequence_batching)) { - triton::common::TritonJson::Value states; - if (sequence_batching.Find("state", &states)) { - for (size_t i = 0; i < states.ArraySize(); i++) { - triton::common::TritonJson::Value state; - RETURN_IF_ERROR(states.IndexAsObject(i, &state)); - std::string output_state_name; - RETURN_IF_ERROR( - state.MemberAsString("output_name", &output_state_name)); - auto it = model_outputs.find(output_state_name); - if (it == model_outputs.end()) { - model_outputs.insert({output_state_name, std::make_pair(-1, i)}); - } else { - it->second.second = i; - } - } - } - } - - // Parse the output names in the model configuration - triton::common::TritonJson::Value outputs; - RETURN_IF_ERROR((*state)->ModelConfig().MemberAsArray("output", &outputs)); - for (size_t i = 0; i < outputs.ArraySize(); i++) { - triton::common::TritonJson::Value output; - THROW_IF_BACKEND_INSTANCE_ERROR(outputs.IndexAsObject(i, &output)); - - // Use names from ModelConfig by reference since the model - // config will persist longer than this inference execution. - std::string output_name; - THROW_IF_BACKEND_INSTANCE_ERROR( - output.MemberAsString("name", &output_name)); - - auto it = model_outputs.find(output_name); - if (it == model_outputs.end()) { - model_outputs.insert({output_name, std::make_pair(i, -1)}); - } else { - it->second.first = i; - } - } - - RETURN_IF_ERROR((*state)->ParseParameters()); - - return nullptr; // success -} - -ModelState::ModelState(TRITONBACKEND_Model* triton_model) - : BackendModel(triton_model), enable_optimized_execution_(true), - enable_inference_mode_(true), enable_cudnn_(true), - enable_cache_cleaning_(false), enable_weight_sharing_(false), - enable_tensor_fuser_pair_({false, true}), - enable_jit_profiling_pair_({false, true}), - enable_jit_executor_pair_({false, true}) -{ -} - -TRITONSERVER_Error* -ModelState::LoadModel( - const std::string& artifact_name, const torch::Device device, - std::string* model_path, const TRITONSERVER_InstanceGroupKind& kind, - std::shared_ptr* torch_model) -{ - // Find the TorchScript file that describes the model. If the model - // configuration doesn't have an explicit model file specified then - // use the default name ("model.pt"). 
- std::string cc_model_filename = artifact_name; - if (cc_model_filename.empty()) { - cc_model_filename = "model.pt"; - } - - *model_path = JoinPath( - {RepositoryPath(), std::to_string(Version()), cc_model_filename}); - - { - bool exists; - RETURN_IF_ERROR(FileExists(*model_path, &exists)); - RETURN_ERROR_IF_FALSE( - exists, TRITONSERVER_ERROR_UNAVAILABLE, - std::string("unable to find '") + *model_path + - "' for model instance '" + Name() + "'"); - } - - // If weight sharing is enabled, skip loading model if - // it is already available on the target device - std::pair device_pair; - if (enable_weight_sharing_) { - device_pair = std::make_pair(!device.is_cpu(), device.index()); - auto mit = torch_models_.find(device_pair); - if (mit != torch_models_.end()) { - *torch_model = mit->second; - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - (std::string("Reusing TorchScript model for instance '") + Name() + - "'") - .c_str()); - return nullptr; // success - } - } - - // Serialize the torch model to string - std::string model_data_str; - RETURN_IF_ERROR(ReadTextFile(*model_path, &model_data_str)); - - // InferenceMode should be used to guard all tensors operations including - // model loading: https://pytorch.org/cppdocs/notes/inference_mode.html - torch::InferenceMode infer_guard(EnabledInferenceMode()); - - try { - std::istringstream model_stream(model_data_str); - if (kind == TRITONSERVER_INSTANCEGROUPKIND_MODEL) { - // Load the model without selecting a device. - torch_model->reset( - new torch::jit::Module(torch::jit::load(model_stream))); - } else { - torch_model->reset( - new torch::jit::Module(torch::jit::load(model_stream, device))); - } - } - catch (const std::exception& ex) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("failed to load model '" + Name() + "': " + ex.what()).c_str()); - } - - if (enable_weight_sharing_) { - if (!((torch_models_.emplace(device_pair, *torch_model)).second)) { - std::string type = device.is_cpu() ? "CPU" : "GPU"; - LOG_MESSAGE( - TRITONSERVER_LOG_WARN, - (std::string("Model already found on target ") + type + " device " + - "(id " + std::to_string(device.index()) + ") for '" + Name() + "'") - .c_str()); - } - } - - return nullptr; // success -} - -TRITONSERVER_Error* -ModelState::AutoCompleteConfig() -{ - // Auto-complete configuration is not supported since PyTorch does not - // store/capture sufficient model metadata so just log error instead. - LOG_MESSAGE( - TRITONSERVER_LOG_WARN, - (std::string("skipping model configuration auto-complete for '") + - Name() + "': not supported for pytorch backend") - .c_str()); - - return nullptr; // success -} - -TRITONSERVER_Error* -ModelState::ParseParameters() -{ - triton::common::TritonJson::Value params; - bool status = model_config_.Find("parameters", ¶ms); - if (status) { - // If 'DISABLE_OPTIMIZED_EXECUTION' is not present in 'parameters' then no - // update is made to 'enable_optimized_execution_'. - bool disable_optimized_execution = false; - TRITONSERVER_Error* err = ParseParameter( - params, "DISABLE_OPTIMIZED_EXECUTION", &disable_optimized_execution); - if (err != nullptr) { - if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { - return err; - } else { - TRITONSERVER_ErrorDelete(err); - } - } - enable_optimized_execution_ = !disable_optimized_execution; - - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - (std::string("Optimized execution is ") + - (enable_optimized_execution_ ? 
"enabled" : "disabled") + - " for model instance '" + Name() + "'") - .c_str()); - - // If 'ENABLE_CACHE_CLEANING' is not present in 'parameters' then - // no update is made to 'enable_cache_cleaning_'. - err = ParseParameter( - params, "ENABLE_CACHE_CLEANING", &enable_cache_cleaning_); - if (err != nullptr) { - if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { - return err; - } else { - TRITONSERVER_ErrorDelete(err); - } - } - - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - (std::string("Cache Cleaning is ") + - (enable_cache_cleaning_ ? "enabled" : "disabled") + - " for model instance '" + Name() + "'") - .c_str()); - - // If 'INFERENCE_MODE' is not present in 'parameters' then no update is made - // to 'enable_inference_mode_'. - err = ParseParameter(params, "INFERENCE_MODE", &enable_inference_mode_); - if (err != nullptr) { - if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { - return err; - } else { - TRITONSERVER_ErrorDelete(err); - } - } - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - (std::string("Inference Mode is ") + - (enable_inference_mode_ ? "enabled" : "disabled") + - " for model instance '" + Name() + "'") - .c_str()); - - // If 'DISABLE_CUDNN' is not present in 'parameters' then no update is made - // to 'enable_cudnn_'. - bool disable_cudnn = false; - err = ParseParameter(params, "DISABLE_CUDNN", &disable_cudnn); - if (err != nullptr) { - if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { - return err; - } else { - TRITONSERVER_ErrorDelete(err); - } - } - enable_cudnn_ = !disable_cudnn; - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - (std::string("cuDNN is ") + (enable_cudnn_ ? "enabled" : "disabled") + - " for model instance '" + Name() + "'") - .c_str()); - - // If 'ENABLE_TENSOR_FUSER' is not present in 'parameters' then no - // update is made to 'enable_tensor_fuser'. - bool enable_tensor_fuser = false; - err = ParseParameter(params, "ENABLE_TENSOR_FUSER", &enable_tensor_fuser); - if (err != nullptr) { - if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { - return err; - } else { - TRITONSERVER_ErrorDelete(err); - } - } else { - enable_tensor_fuser_pair_ = {true, enable_tensor_fuser}; - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - (std::string("Tensor fuser is ") + - (enable_tensor_fuser ? "enabled" : "disabled") + - " for model instance '" + Name() + "'") - .c_str()); - } - - // If 'ENABLE_WEIGHT_SHARING' is not present in 'parameters' then no - // update is made to 'enable_weight_sharing'. - err = ParseParameter( - params, "ENABLE_WEIGHT_SHARING", &enable_weight_sharing_); - if (err != nullptr) { - if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { - return err; - } else { - TRITONSERVER_ErrorDelete(err); - } - } else { - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - (std::string("Weight sharing is ") + - (enable_weight_sharing_ ? "enabled" : "disabled") + - " for model instance '" + Name() + "'") - .c_str()); - } - - // If 'ENABLE_JIT_PROFILING' is not present in 'parameters' then no update - // is made to 'enable_jit_profiling'. - bool enable_jit_profiling = false; - err = ParseParameter(params, "ENABLE_JIT_PROFILING", &enable_jit_profiling); - if (err != nullptr) { - if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { - return err; - } else { - TRITONSERVER_ErrorDelete(err); - } - } else { - enable_jit_profiling_pair_ = {true, enable_jit_profiling}; - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - (std::string("Jit profiling is ") + - (enable_jit_profiling ? 
"enabled" : "disabled") + - " for model instance '" + Name() + "'") - .c_str()); - } - - // If 'ENABLE_JIT_EXECUTOR' is not present in 'parameters' then no update is - // made to 'enable_jit_executor'. - bool enable_jit_executor = false; - err = ParseParameter(params, "ENABLE_JIT_EXECUTOR", &enable_jit_executor); - if (err != nullptr) { - if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { - return err; - } else { - TRITONSERVER_ErrorDelete(err); - } - } else { - enable_jit_executor_pair_ = {true, enable_jit_executor}; - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - (std::string("Jit executor is ") + - (enable_jit_executor ? "enabled" : "disabled") + - " for model instance '" + Name() + "'") - .c_str()); - } - - // If 'INTRA_OP_THREAD_COUNT' is not present in 'parameters' then no update - // is made to 'intra_op_thread_count', which by default will take all - // threads - int intra_op_thread_count = -1; - err = - ParseParameter(params, "INTRA_OP_THREAD_COUNT", &intra_op_thread_count); - if (err != nullptr) { - if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { - return err; - } else { - TRITONSERVER_ErrorDelete(err); - } - } else { - if (intra_op_thread_count > 0) { - // at::set_num_threads() does not throw if called more than once, but - // issues warnings. std::call_once() is useful to limit these. - std::call_once(pytorch_intraop_threads_flag, [intra_op_thread_count]() { - at::set_num_threads(intra_op_thread_count); - }); - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - (std::string("Intra op thread count is set to ") + - std::to_string(at::get_num_threads()) + " for model instance '" + - Name() + "'") - .c_str()); - } - } - - // If 'INTER_OP_THREAD_COUNT' is not present in 'parameters' then no update - // is made to 'inter_op_thread_count', which by default will take all - // threads - int inter_op_thread_count = -1; - err = - ParseParameter(params, "INTER_OP_THREAD_COUNT", &inter_op_thread_count); - if (err != nullptr) { - if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { - return err; - } else { - TRITONSERVER_ErrorDelete(err); - } - } else { - if (inter_op_thread_count > 0) { - // at::set_num_interop_threads() throws if called more than once. - // std::call_once() should prevent this, but try/catch is additionally - // used for safety. - std::call_once(pytorch_interop_threads_flag, [inter_op_thread_count]() { - try { - at::set_num_interop_threads(inter_op_thread_count); - } - catch (const c10::Error& e) { - // do nothing - } - }); - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - (std::string("Inter op thread count is set to ") + - std::to_string(at::get_num_interop_threads()) + - " for model instance '" + Name() + "'") - .c_str()); - } - } - } - - return nullptr; -} - -// The naming convention followed for inputs/outputs in the model configuration. -// Outputs don't support FORWARD_ARGUMENT. -enum class NamingConvention { - NAMED_INDEX, - FORWARD_ARGUMENT, - STRICT_CONFIG_ORDERING -}; - -// -// ModelInstanceState -// -// State associated with a model instance. An object of this class is -// created and associated with each TRITONBACKEND_ModelInstance. -// -class ModelInstanceState : public BackendModelInstance { - public: - static TRITONSERVER_Error* Create( - ModelState* model_state, - TRITONBACKEND_ModelInstance* triton_model_instance, - ModelInstanceState** state); - virtual ~ModelInstanceState(); - - // Get the state of the model that corresponds to this instance. - ModelState* StateForModel() const { return model_state_; } - - // Execute... 
- void ProcessRequests( - TRITONBACKEND_Request** requests, const uint32_t request_count); - - // Clear CUDA cache - void ClearCache(); - - private: - ModelInstanceState( - ModelState* model_state, - TRITONBACKEND_ModelInstance* triton_model_instance); - TRITONSERVER_Error* ValidateBooleanSequenceControl( - triton::common::TritonJson::Value& sequence_batching, - const std::string& control_kind, bool required, bool* have_control); - TRITONSERVER_Error* ValidateTypedSequenceControl( - triton::common::TritonJson::Value& sequence_batching, - const std::string& control_kind, bool required, bool* have_control); - TRITONSERVER_Error* ValidateInputs(const size_t expected_input_cnt); - void AddInputToMap( - NamingConvention naming_convention, - const std::vector allowed_inputs, const std::string& io_name, - const uint32_t index); - TRITONSERVER_Error* ValidateOutputs(); - void Execute( - std::vector* responses, - const uint32_t response_count, - std::vector* input_tensors, - std::vector* output_tensors); - TRITONSERVER_Error* SetInputTensors( - size_t total_batch_size, TRITONBACKEND_Request** requests, - const uint32_t request_count, - std::vector* responses, - BackendInputCollector* collector, std::vector* input_names, - std::vector* input_tensors, bool* cuda_copy); - TRITONSERVER_Error* ReadOutputTensors( - size_t total_batch_size, - const std::vector& output_tensors, - TRITONBACKEND_Request** requests, const uint32_t request_count, - std::vector* responses); - TRITONSERVER_Error* RecordBackendTimestamp( - uint64_t* timestamp, void* cuda_event); - - // Get the naming convention for inputs/outputs from the model configuration - TRITONSERVER_Error* GetNamingConvention( - NamingConvention* naming_convention, - const std::vector& allowed_io); - - // Create CUDA events for statistics collection. - void CreateCudaEvents(const int32_t& device_id); - - // Get the appropriate CUDA stream for input and output handling based on the - // instance group type. - cudaStream_t GetCudaStreamByInstanceKind(); - - // Replace the default CUDA stream with the stream we created to ensure proper - // cuda stream synchronization. - void SetCurrentCudaStream( - const cudaStream_t& stream, const int32_t& device_id); - - // Get the elapsed time between two CUDA events. - float GetCudaEventElapsedTime( - const cudaEvent_t& start_event, const cudaEvent_t& end_event); - - ModelState* model_state_; - - // The full path to the TorchScript model file. - std::string model_path_; - - std::shared_ptr torch_model_; - torch::Device device_; - - // Map from configuration name for an input to the index of - // that input in the model. - std::unordered_map input_index_map_; - uint32_t batch_input_count_ = 0; - - // Map from configuration name for an output to the index of - // that output in the model. - std::unordered_map output_index_map_; - std::unordered_map output_dtype_map_; - - // If the input to the tensor is a dictionary of tensors. - bool is_dict_input_; - - // If the model supports batching. - bool supports_batching_; - - cudaEvent_t compute_input_start_event_; - cudaEvent_t compute_infer_start_event_; - cudaEvent_t compute_output_start_event_; - - // Store the cuda streams created for the 'KIND_MODEL' instance group. - std::vector stream_vec_; - - // The number of available devices. 
- int device_cnt_; -}; - -TRITONSERVER_Error* -ModelInstanceState::Create( - ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance, - ModelInstanceState** state) -{ - try { - *state = new ModelInstanceState(model_state, triton_model_instance); - } - catch (const BackendModelInstanceException& ex) { - RETURN_ERROR_IF_TRUE( - ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL, - std::string("unexpected nullptr in BackendModelInstanceException")); - RETURN_IF_ERROR(ex.err_); - } - - return nullptr; // success -} - -ModelInstanceState::ModelInstanceState( - ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance) - : BackendModelInstance(model_state, triton_model_instance), - model_state_(model_state), device_(torch::kCPU), is_dict_input_(false), - device_cnt_(0) -{ - if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) { -#ifdef TRITON_ENABLE_GPU - device_ = torch::Device(torch::kCUDA, DeviceId()); - CreateCudaEvents(DeviceId()); -#endif - } - -#ifdef TRITON_ENABLE_GPU - device_cnt_ = torch::cuda::device_count(); -#endif - - THROW_IF_BACKEND_INSTANCE_ERROR(model_state->LoadModel( - ArtifactFilename(), device_, &model_path_, Kind(), &torch_model_)); - - if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) { -#ifdef TRITON_ENABLE_GPU - // Since we cannot determine the exact devices used by the model, we create - // a CUDA stream for every available device to ensure proper synchronization - // of CUDA streams. This approach may have implications when a timestamp is - // captured on a device that is not used by the model. Currently, this issue - // is addressed by synchronizing the CUDA streams before recording - // timestamps to prevent timestamp skewing. However, in the future, any - // modifications to the CUDA stream synchronization logic should be handled - // with caution. - for (int i = 0; i < device_cnt_; i++) { - cudaStream_t stream; - THROW_IF_BACKEND_INSTANCE_ERROR( - CreateCudaStream(i, 0 /* cuda_stream_priority */, &stream)); - stream_vec_.push_back(stream); - } - if (!stream_vec_.empty()) { - // Create CUDA events on the first device that will be used for collecting - // inputs/outputs. - CreateCudaEvents(0); - } -#endif - } - - size_t expected_input_cnt = 0; - { - triton::common::TritonJson::Value inputs; - if (model_state->ModelConfig().Find("input", &inputs)) { - expected_input_cnt = inputs.ArraySize(); - } - - triton::common::TritonJson::Value config_batch_inputs; - if (model_state->ModelConfig().Find("batch_input", &config_batch_inputs)) { - batch_input_count_ = config_batch_inputs.ArraySize(); - expected_input_cnt += batch_input_count_; - } - } - - // If this is a sequence model then make sure that the required - // inputs are present in the model and have the correct shape and - // datatype. 
- triton::common::TritonJson::Value sequence_batching; - if (model_state->ModelConfig().Find( - "sequence_batching", &sequence_batching)) { - bool have_start, have_end, have_ready, have_corrid; - THROW_IF_BACKEND_INSTANCE_ERROR(ValidateBooleanSequenceControl( - sequence_batching, "CONTROL_SEQUENCE_START", false /* required */, - &have_start)); - THROW_IF_BACKEND_INSTANCE_ERROR(ValidateBooleanSequenceControl( - sequence_batching, "CONTROL_SEQUENCE_END", false /* required */, - &have_end)); - THROW_IF_BACKEND_INSTANCE_ERROR(ValidateBooleanSequenceControl( - sequence_batching, "CONTROL_SEQUENCE_READY", false /* required */, - &have_ready)); - THROW_IF_BACKEND_INSTANCE_ERROR(ValidateTypedSequenceControl( - sequence_batching, "CONTROL_SEQUENCE_CORRID", false /* required */, - &have_corrid)); - if (have_start) { - expected_input_cnt += 1; - } - if (have_end) { - expected_input_cnt += 1; - } - if (have_ready) { - expected_input_cnt += 1; - } - if (have_corrid) { - expected_input_cnt += 1; - } - // Add the state inputs to the expected count - triton::common::TritonJson::Value states; - if (sequence_batching.Find("state", &states)) { - expected_input_cnt += states.ArraySize(); - } - } - supports_batching_ = model_state_->MaxBatchSize() > 0; - - THROW_IF_BACKEND_INSTANCE_ERROR(ValidateInputs(expected_input_cnt)); - THROW_IF_BACKEND_INSTANCE_ERROR(ValidateOutputs()); -} - -void -ModelInstanceState::ClearCache() -{ -#ifdef TRITON_ENABLE_GPU - if (device_.is_cuda() || - ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && (device_cnt_ > 0))) { - c10::cuda::CUDACachingAllocator::emptyCache(); - } -#endif // TRITON_ENABLE_GPU -} - -ModelInstanceState::~ModelInstanceState() -{ - torch_model_.reset(); - ClearCache(); - - if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) { -#ifdef TRITON_ENABLE_GPU - for (size_t i = 0; i < stream_vec_.size(); i++) { - LOG_IF_ERROR( - ConvertCUDAStatusToTritonError( - cudaSetDevice(i), TRITONSERVER_ERROR_INTERNAL, - "Failed to set the device"), - "Failed to set the device"); - - LOG_IF_ERROR( - ConvertCUDAStatusToTritonError( - cudaStreamDestroy(stream_vec_[i]), TRITONSERVER_ERROR_INTERNAL, - "Failed to destroy cuda stream"), - "~ModelInstanceState error: "); - stream_vec_[i] = nullptr; - } -#endif - } -} - -TRITONSERVER_Error* -ModelInstanceState::ValidateBooleanSequenceControl( - triton::common::TritonJson::Value& sequence_batching, - const std::string& control_kind, bool required, bool* have_control) -{ - std::string tensor_name; - std::string tensor_datatype; - RETURN_IF_ERROR(GetBooleanSequenceControlProperties( - sequence_batching, model_state_->Name(), control_kind, required, - &tensor_name, &tensor_datatype, nullptr, nullptr, nullptr, nullptr, - nullptr, nullptr)); - *have_control = !tensor_name.empty(); - if (*have_control) { - std::string deliminator = "__"; - int ip_index = 0; - int start_pos = tensor_name.find(deliminator); - if (start_pos == -1) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("input '" + tensor_name + - "' does not follow __ naming convention.") - .c_str()); - } - - // check if the index part of the name is not an integer - std::string index_str = tensor_name.substr(start_pos + 2); - for (auto itr = index_str.begin(); itr != index_str.end(); itr++) { - if (std::isdigit(*itr) == 0) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("input '" + tensor_name + - "' does not follow __ naming convention.") - .c_str()); - } - } - - ip_index = std::atoi(tensor_name.substr(start_pos + 2).c_str()); - 
input_index_map_[tensor_name] = ip_index; - } - - return nullptr; // success -} - -TRITONSERVER_Error* -ModelInstanceState::ValidateTypedSequenceControl( - triton::common::TritonJson::Value& sequence_batching, - const std::string& control_kind, bool required, bool* have_control) -{ - std::string tensor_name; - std::string tensor_datatype; - RETURN_IF_ERROR(GetTypedSequenceControlProperties( - sequence_batching, model_state_->Name(), control_kind, required, - &tensor_name, &tensor_datatype)); - *have_control = !tensor_name.empty(); - if (*have_control) { - std::string deliminator = "__"; - int ip_index = 0; - int start_pos = tensor_name.find(deliminator); - if (start_pos == -1) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("input '" + tensor_name + - "' does not follow __ naming convention.") - .c_str()); - } - - // check if the index part of the name is not an integer - std::string index_str = tensor_name.substr(start_pos + 2); - for (auto itr = index_str.begin(); itr != index_str.end(); itr++) { - if (std::isdigit(*itr) == 0) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("input '" + tensor_name + - "' does not follow __ naming convention.") - .c_str()); - } - } - - // check if the data type is supported by PyTorch - if (!ModelConfigDataTypeToTorchType(tensor_datatype).first) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("input '" + tensor_name + "' type '" + tensor_datatype + - "' is not supported by PyTorch.") - .c_str()); - } - - ip_index = std::atoi(tensor_name.substr(start_pos + 2).c_str()); - input_index_map_[tensor_name] = ip_index; - } - - return nullptr; // success -} - -void -ModelInstanceState::AddInputToMap( - NamingConvention naming_convention, - const std::vector allowed_inputs, const std::string& io_name, - const uint32_t index) -{ - std::string deliminator = "__"; - - if (is_dict_input_) { - // If dictionary, index is irrelevant but we use the map to store the - // input names since they are the keys for the dictionary - input_index_map_[io_name] = index; - } else { - switch (naming_convention) { - case NamingConvention::FORWARD_ARGUMENT: { - auto itr = - std::find(allowed_inputs.begin(), allowed_inputs.end(), io_name); - if (itr != allowed_inputs.end()) { - input_index_map_[io_name] = - std::distance(allowed_inputs.begin(), itr); - } - return; - } - case NamingConvention::NAMED_INDEX: { - int start_pos = io_name.find(deliminator); - int ip_index = std::atoi(io_name.substr(start_pos + 2).c_str()); - input_index_map_[io_name] = ip_index; - return; - } - case NamingConvention::STRICT_CONFIG_ORDERING: { - input_index_map_[io_name] = index; - return; - } - } - } -} - -TRITONSERVER_Error* -ModelInstanceState::ValidateInputs(const size_t expected_input_cnt) -{ - // Collect all the expected input tensor names and validate that the model - // configuration specifies only those. - std::vector allowed_inputs; - - const torch::jit::Method& method = torch_model_->get_method("forward"); - const auto& schema = method.function().getSchema(); - const std::vector& arguments = schema.arguments(); - - // Currently, only models with a single input of type Dict(str, Tensor) are - // supported. If the model expects more than one input then they must be all - // be of type Tensor. 
- // - // Ignore the argument at idx 0 if it is of Class type (self param in forward - // function) - size_t start_idx = 0; - if ((arguments.size() > 0) && - (arguments.at(0).type()->kind() == c10::TypeKind::ClassType)) { - start_idx = 1; - } - if ((arguments.size() == (1 + start_idx)) && - (arguments.at(start_idx).type()->kind() == c10::TypeKind::DictType)) { - is_dict_input_ = true; - } else if (arguments.size() > start_idx) { - // Return error if multiple inputs are of kind DictType - for (size_t i = start_idx + 1; i < arguments.size(); i++) { - if (arguments.at(i).type()->kind() == c10::TypeKind::DictType) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - "Multiple inputs of kind DictType were detected. Only a single " - "input of type Dict(str, Tensor) is supported."); - } - } - - // Return error if all inputs are not of type Tensor - for (size_t i = start_idx; i < arguments.size(); i++) { - if ((arguments.at(i).type()->kind() != c10::TypeKind::TensorType) && - (arguments.at(i).type()->kind() != c10::TypeKind::ListType)) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("An input of type '") + arguments.at(i).type()->str() + - "' was detected in the model. Only a single input of type " - "Dict(str, Tensor) or input(s) of type Tensor are supported.") - .c_str()); - } - allowed_inputs.emplace_back(arguments.at(i).name()); - } - - // If all inputs are tensors, match number of expected inputs between model - // and configuration - if ((arguments.size() - start_idx) != expected_input_cnt) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("unable to load model '") + model_state_->Name() + - "', configuration expects " + std::to_string(expected_input_cnt) + - " inputs, model provides " + - std::to_string(arguments.size() - start_idx)) - .c_str()); - } - } - - triton::common::TritonJson::Value ios; - RETURN_IF_ERROR(model_state_->ModelConfig().MemberAsArray("input", &ios)); - - if (ios.ArraySize() == 0) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - "model configuration must contain at least one input, none were " - "specified."); - } - - NamingConvention naming_convention; - RETURN_IF_ERROR(GetNamingConvention(&naming_convention, allowed_inputs)); - - for (size_t i = 0; i < ios.ArraySize(); i++) { - triton::common::TritonJson::Value io; - RETURN_IF_ERROR(ios.IndexAsObject(i, &io)); - - // Validate name - std::string io_name; - RETURN_IF_ERROR(io.MemberAsString("name", &io_name)); - AddInputToMap(naming_convention, allowed_inputs, io_name, i); - // Validate data type - std::string io_dtype; - RETURN_IF_ERROR(io.MemberAsString("data_type", &io_dtype)); - const auto pr = ModelConfigDataTypeToTorchType(io_dtype); - if (!pr.first && (io_dtype != "TYPE_STRING")) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("unsupported datatype " + io_dtype + " for input '" + io_name + - "' for model '" + model_state_->Name() + "'") - .c_str()); - } - - // Validate shape for String inputs. Only allow 1 dimension. - if (io_dtype == "TYPE_STRING") { - // If a reshape is provided for the input then use that when - // validating the model shapes. - std::vector dims; - triton::common::TritonJson::Value reshape; - if (io.Find("reshape", &reshape)) { - RETURN_IF_ERROR(ParseShape(reshape, "shape", &dims)); - } else { - RETURN_IF_ERROR(ParseShape(io, "dims", &dims)); - } - - if ((dims.size() + (supports_batching_ ? 
1 : 0)) > 1) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("Triton only supports 1 dimensional List of String as input for " - "'" + - std::string(io_name) + "' for model '" + model_state_->Name() + - "'") - .c_str()); - } - } - } - triton::common::TritonJson::Value sequence_batching; - if (model_state_->ModelConfig().Find( - "sequence_batching", &sequence_batching)) { - triton::common::TritonJson::Value states; - if (sequence_batching.Find("state", &states)) { - for (size_t i = 0; i < states.ArraySize(); i++) { - triton::common::TritonJson::Value state; - RETURN_IF_ERROR(states.IndexAsObject(i, &state)); - std::string state_name; - RETURN_IF_ERROR(state.MemberAsString("input_name", &state_name)); - AddInputToMap(naming_convention, allowed_inputs, state_name, i); - - // Validate data type - std::string state_dtype; - RETURN_IF_ERROR(state.MemberAsString("data_type", &state_dtype)); - const auto pr = ModelConfigDataTypeToTorchType(state_dtype); - if (!pr.first && (state_dtype != "TYPE_STRING")) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("unsupported datatype " + state_dtype + " for input state '" + - state_name + "' for model '" + model_state_->Name() + "'") - .c_str()); - } - - // Validate shape for String inputs. Only allow 1 dimension. - if (state_dtype == "TYPE_STRING") { - std::vector dims; - if ((dims.size() + (supports_batching_ ? 1 : 0)) > 1) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("Triton only supports 1 dimensional List of String as input " - "for " - "'" + - std::string(state_name) + "' for model '" + - model_state_->Name() + "'") - .c_str()); - } - } - } - } - } - - triton::common::TritonJson::Value batch_inputs; - RETURN_IF_ERROR( - model_state_->ModelConfig().MemberAsArray("batch_input", &batch_inputs)); - size_t i = 0; - for (const auto& batch_input : StateForModel()->BatchInputs()) { - for (const auto& input_name : batch_input.TargetNames()) { - AddInputToMap( - naming_convention, allowed_inputs, input_name, i + ios.ArraySize()); - i++; - } - } - - return nullptr; // success -} - -TRITONSERVER_Error* -ModelInstanceState::ValidateOutputs() -{ - triton::common::TritonJson::Value ios; - RETURN_IF_ERROR(model_state_->ModelConfig().MemberAsArray("output", &ios)); - std::string deliminator = "__"; - int op_index = 0; - - if (ios.ArraySize() == 0) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - "model configuration must contain at least one output, none were " - "specified."); - } - - NamingConvention naming_convention; - RETURN_IF_ERROR(GetNamingConvention(&naming_convention, {})); - - for (size_t i = 0; i < ios.ArraySize(); i++) { - triton::common::TritonJson::Value io; - RETURN_IF_ERROR(ios.IndexAsObject(i, &io)); - - // Validate name - std::string io_name; - RETURN_IF_ERROR(io.MemberAsString("name", &io_name)); - switch (naming_convention) { - case NamingConvention::NAMED_INDEX: { - int start_pos = io_name.find(deliminator); - op_index = std::atoi(io_name.substr(start_pos + 2).c_str()); - break; - } - case NamingConvention::STRICT_CONFIG_ORDERING: { - op_index = i; - break; - } - default: - break; - } - - // Validate data type - std::string io_dtype; - RETURN_IF_ERROR(io.MemberAsString("data_type", &io_dtype)); - const auto pr = ModelConfigDataTypeToTorchType(io_dtype); - if (!pr.first && (io_dtype != "TYPE_STRING")) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("unsupported datatype " + io_dtype + " for output '" + io_name + - "' for model '" + 
model_state_->Name() + "'") - .c_str()); - } - - // Validate shape for String outputs. Only allow 1 dimension. - if (io_dtype == "TYPE_STRING") { - // If a reshape is provided for the output then use that when - // validating the model shapes. - std::vector dims; - triton::common::TritonJson::Value reshape; - if (io.Find("reshape", &reshape)) { - RETURN_IF_ERROR(ParseShape(reshape, "shape", &dims)); - } else { - RETURN_IF_ERROR(ParseShape(io, "dims", &dims)); - } - - if ((dims.size() + (supports_batching_ ? 1 : 0)) > 1) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("Triton only supports 1 dimensional List of String as output for " - "'" + - std::string(io_name) + "' for model '" + model_state_->Name() + - "'") - .c_str()); - } - } - - output_index_map_[io_name] = op_index; - output_dtype_map_[io_name] = ConvertTorchTypeToDataType(pr.second); - } - - triton::common::TritonJson::Value sequence_batching; - if (model_state_->ModelConfig().Find( - "sequence_batching", &sequence_batching)) { - triton::common::TritonJson::Value states; - if (sequence_batching.Find("state", &states)) { - for (size_t i = 0; i < states.ArraySize(); i++) { - triton::common::TritonJson::Value state; - RETURN_IF_ERROR(states.IndexAsObject(i, &state)); - std::string state_name; - RETURN_IF_ERROR(state.MemberAsString("output_name", &state_name)); - std::string state_dtype; - RETURN_IF_ERROR(state.MemberAsString("data_type", &state_dtype)); - std::vector dims; - RETURN_IF_ERROR(ParseShape(state, "dims", &dims)); - - // For state, naming convention is enforced to be NAMED_INDEX - int start_pos = state_name.find(deliminator); - op_index = std::atoi(state_name.substr(start_pos + 2).c_str()); - - const auto pr = ModelConfigDataTypeToTorchType(state_dtype); - if (!pr.first && (state_dtype != "TYPE_STRING")) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("unsupported datatype " + state_dtype + " for state '" + - state_name + "' for model '" + model_state_->Name() + "'") - .c_str()); - } - - // Validate shape for String outputs. Only allow 1 dimension. - if (state_dtype == "TYPE_STRING") { - if ((dims.size() + (supports_batching_ ? 1 : 0)) > 1) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("Triton only supports 1 dimensional List of String as output " - "for " - "'" + - std::string(state_name) + "' for model '" + - model_state_->Name() + "'") - .c_str()); - } - } - - output_index_map_[state_name] = op_index; - output_dtype_map_[state_name] = ConvertTorchTypeToDataType(pr.second); - } - } - } - - return nullptr; // success -} - -void -ModelInstanceState::ProcessRequests( - TRITONBACKEND_Request** requests, const uint32_t request_count) -{ - LOG_MESSAGE( - TRITONSERVER_LOG_VERBOSE, - (std::string("TRITONBACKEND_ModelExecute: Running ") + Name() + " with " + - std::to_string(request_count) + " requests") - .c_str()); - -#ifdef TRITON_ENABLE_GPU - if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) { - SetCurrentCudaStream(stream_, DeviceId()); - } else if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) { - // Replace the default stream of each device with the one we created. - for (size_t i = 0; i < stream_vec_.size(); i++) { - SetCurrentCudaStream(stream_vec_[i], i); - } - } -#endif - - NVTX_RANGE(nvtx_, "ProcessRequests " + Name()); - - uint64_t exec_start_ns = 0; - SET_TIMESTAMP(exec_start_ns); - - const int max_batch_size = model_state_->MaxBatchSize(); - - // For each request collect the total batch size for this inference - // execution. 
The batch-size, number of inputs, and size of each - // input has already been checked so don't need to do that here. - size_t total_batch_size = 0; - for (size_t i = 0; i < request_count; i++) { - // If we get a nullptr request then something is badly wrong. Fail - // and release all requests. - if (requests[i] == nullptr) { - RequestsRespondWithError( - requests, request_count, - TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - std::string( - "null request given to PyTorch backend for '" + Name() + "'") - .c_str())); - return; - } - } - - // At this point we are committed to running inference with all - // 'requests'. Create a response for each request. During input - // processing if there is an error with any request that error will - // be sent immediately with the corresponding response (and the - // response unique_ptr will then be nullptr). The request object - // itself will not be released until after all inferencing is done - // (below) as we may need to access the request object when - // determine how to process outputs (for example, even if we don't - // need the outputs for a request that has an error, we do need to - // know the size of those outputs associated with the request so we - // can skip them in the output tensors). - std::vector responses; - responses.reserve(request_count); - bool all_response_failed = false; - - for (size_t i = 0; i < request_count; i++) { - TRITONBACKEND_Response* response; - auto err = TRITONBACKEND_ResponseNew(&response, requests[i]); - if (err == nullptr) { - responses.emplace_back(response); - } else { - responses.emplace_back(nullptr); - LOG_MESSAGE(TRITONSERVER_LOG_ERROR, "Fail to create response"); - TRITONSERVER_ErrorDelete(err); - } - } - - for (size_t i = 0; i < request_count; i++) { - if (max_batch_size > 0) { - // Retrieve the batch size from one of the inputs, if the model - // supports batching, the first dimension size is batch size. - TRITONBACKEND_Input* input; - TRITONSERVER_Error* err = - TRITONBACKEND_RequestInputByIndex(requests[i], 0 /* index */, &input); - if (err == nullptr) { - const int64_t* shape; - err = TRITONBACKEND_InputProperties( - input, nullptr, nullptr, &shape, nullptr, nullptr, nullptr); - total_batch_size += shape[0]; - } - if (err != nullptr) { - RESPOND_ALL_AND_SET_TRUE_IF_ERROR( - responses, request_count, all_response_failed, err); - } - } else { - total_batch_size += 1; - } - } - - // If there are no valid payloads then no need to run the inference. - if (total_batch_size == 0) { - return; - } - - // Make sure the maximum batch size is not exceeded. The - // total_batch_size must be 1 for models that don't support batching - // (i.e. max_batch_size == 0). If max_batch_size is exceeded then - // scheduler has done something badly wrong so fail and release all - // requests. - if (!all_response_failed) { - if ((total_batch_size != 1) && - (total_batch_size > (size_t)max_batch_size)) { - RESPOND_ALL_AND_SET_TRUE_IF_ERROR( - responses, request_count, all_response_failed, - TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - std::string( - "batch size " + std::to_string(total_batch_size) + " for '" + - Name() + "', max allowed is " + - std::to_string(max_batch_size)) - .c_str())); - } - } - - std::vector input_names; - std::vector input_tensors; - bool cuda_copy = false; - std::unique_ptr collector; - - // For 'KIND_MODEL', it's fine to use CUDA events to calculate the compute - // input duration since only one stream will be used for input collection. 
- if ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) || - ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && (device_cnt_ > 0))) { -#ifdef TRITON_ENABLE_GPU - RESPOND_ALL_AND_SET_TRUE_IF_ERROR( - responses, request_count, all_response_failed, - ConvertCUDAStatusToTritonError( - cudaEventRecord( - compute_input_start_event_, GetCudaStreamByInstanceKind()), - TRITONSERVER_ERROR_INTERNAL, "Failed to record the event.")); -#endif - } - - if (!all_response_failed) { - collector.reset(new BackendInputCollector( - requests, request_count, &responses, - model_state_->TritonMemoryManager(), model_state_->EnablePinnedInput(), - GetCudaStreamByInstanceKind(), nullptr, nullptr, 0, - HostPolicyName().c_str())); - RESPOND_ALL_AND_SET_TRUE_IF_ERROR( - responses, request_count, all_response_failed, - SetInputTensors( - total_batch_size, requests, request_count, &responses, - collector.get(), &input_names, &input_tensors, &cuda_copy)); - } - -#ifdef TRITON_ENABLE_GPU - if (cuda_copy) { - cudaStreamSynchronize(GetCudaStreamByInstanceKind()); - cuda_copy = false; - } -#endif - - std::vector output_tensors; - uint64_t compute_start_ns = 0; - uint64_t compute_infer_start = 0; - - RESPOND_ALL_AND_SET_TRUE_IF_ERROR( - responses, request_count, all_response_failed, - RecordBackendTimestamp( - &compute_start_ns, - reinterpret_cast(&compute_infer_start_event_))); - - // For 'KIND_MODEL', capture the timestamp for the compute infer duration. - if ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && (device_cnt_ > 0)) { - SET_TIMESTAMP(compute_infer_start); - } - - // Run... - if (!all_response_failed) { - Execute(&responses, request_count, &input_tensors, &output_tensors); - } - - // Verify output indices are valid with number of outputs after execution - bool invalid_index = false; - int max_index = output_tensors.size() - 1; - - if (!all_response_failed) { - for (const auto& name : model_state_->ModelOutputs()) { - int op_index = output_index_map_[name.first]; - if ((op_index < 0) || (op_index > max_index)) { - RESPOND_ALL_AND_SET_TRUE_IF_ERROR( - responses, request_count, all_response_failed, - TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - std::string( - "The output " + std::string(name.first) + - " in the model configuration refers to an output index " - "which doesn't exist. This model has " + - std::to_string(max_index + 1) + " outputs") - .c_str())); - invalid_index = true; - break; - } - } - } - -#ifdef TRITON_ENABLE_GPU - if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) { - // For 'KIND_MODEL', multiple streams will be involved, so we need to call - // 'cudaStreamSynchronize' before reading the output tensors. 
- for (auto& stream : stream_vec_) { - cudaStreamSynchronize(stream); - } - } -#endif - - uint64_t compute_end_ns = 0; - uint64_t compute_output_start = 0; - - if ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && (device_cnt_ > 0)) { -#ifdef TRITON_ENABLE_GPU - SET_TIMESTAMP(compute_output_start); -#endif - } else { - RESPOND_ALL_AND_SET_TRUE_IF_ERROR( - responses, request_count, all_response_failed, - RecordBackendTimestamp( - &compute_end_ns, - reinterpret_cast(&compute_output_start_event_))); - } - - if (!all_response_failed) { - if (!invalid_index) { - RESPOND_ALL_AND_SET_TRUE_IF_ERROR( - responses, request_count, all_response_failed, - ReadOutputTensors( - total_batch_size, output_tensors, requests, request_count, - &responses)); - } - } - - uint64_t exec_end_ns = 0; - SET_TIMESTAMP(exec_end_ns); - - // Send all the responses that haven't already been sent because of - // an earlier error. Note that the responses are not set to nullptr - // here as we need that indication below to determine if the request - // we successful or not. - for (auto& response : responses) { - if (response != nullptr) { - LOG_IF_ERROR( - TRITONBACKEND_ResponseSend( - response, TRITONSERVER_RESPONSE_COMPLETE_FINAL, nullptr), - "failed to send PyTorch backend response"); - } - } - - // We don't need an explicit CUDA syncrhonization here since we have already - // synchronized the stream in the ReadOutputTensors function. - if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) { -#ifdef TRITON_ENABLE_GPU - float compute_input_duration = GetCudaEventElapsedTime( - compute_input_start_event_, compute_infer_start_event_); - float compute_infer_duration = GetCudaEventElapsedTime( - compute_infer_start_event_, compute_output_start_event_); - - compute_start_ns = exec_start_ns + (compute_input_duration * 1e6); - compute_end_ns = compute_start_ns + (compute_infer_duration * 1e6); -#endif - } else if ( - (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && (device_cnt_ > 0)) { -#ifdef TRITON_ENABLE_GPU - float compute_input_duration = GetCudaEventElapsedTime( - compute_input_start_event_, compute_infer_start_event_); - uint64_t compute_infer_duration = - compute_output_start - compute_infer_start; - - compute_start_ns = exec_start_ns + (compute_input_duration * 1e6); - compute_end_ns = compute_start_ns + compute_infer_duration; -#endif - } - - // Report statistics for each request. - for (uint32_t r = 0; r < request_count; ++r) { - auto& request = requests[r]; - LOG_IF_ERROR( - TRITONBACKEND_ModelInstanceReportStatistics( - TritonModelInstance(), request, - (responses[r] != nullptr) /* success */, exec_start_ns, - compute_start_ns, compute_end_ns, exec_end_ns), - "failed reporting request statistics"); - - LOG_IF_ERROR( - TRITONBACKEND_RequestRelease(request, TRITONSERVER_REQUEST_RELEASE_ALL), - "failed releasing request"); - } - - if (!all_response_failed) { - // Report the entire batch statistics. 
- LOG_IF_ERROR( - TRITONBACKEND_ModelInstanceReportBatchStatistics( - TritonModelInstance(), total_batch_size, exec_start_ns, - compute_start_ns, compute_end_ns, exec_end_ns), - "failed reporting batch request statistics"); - } -} - -void -ModelInstanceState::Execute( - std::vector* responses, - const uint32_t response_count, - std::vector* input_tensors, - std::vector* output_tensors) -{ - NVTX_RANGE(nvtx_, "Execute " + Name()); - - torch::jit::IValue model_outputs_; - - try { - // enable/disable optimized execution - torch::jit::setGraphExecutorOptimize( - model_state_->EnabledOptimizedExecution()); - - // enable/disable inference mode - supersedes NoGradGuard - torch::InferenceMode infer_guard(model_state_->EnabledInferenceMode()); - - // enable/disable cudnn - at::globalContext().setUserEnabledCuDNN(model_state_->EnabledCudnn()); - - // JIT. No change is made unless parameter is explicitly set. - if (std::get<0>(model_state_->EnabledJitProfiling())) { - torch::jit::getProfilingMode() = - std::get<1>(model_state_->EnabledJitProfiling()); - } - - if (std::get<0>(model_state_->EnabledJitExecutor())) { - torch::jit::getExecutorMode() = - std::get<1>(model_state_->EnabledJitExecutor()); - } - - // Fuser. No change is made unless fuser is explicitly set in - // parameters. - if (std::get<0>(model_state_->EnabledTensorExprFuser())) { - torch::jit::setTensorExprFuserEnabled( - std::get<1>(model_state_->EnabledTensorExprFuser())); - } - - torch::NoGradGuard no_grad; - - // If input is a dictionary, prepare dictionary from 'input_tensors'. - if (is_dict_input_) { - torch::Dict input_dict; - for (auto& input_index : input_index_map_) { - torch::jit::IValue ival = (*input_tensors)[input_index.second]; - input_dict.insert(input_index.first, ival.toTensor()); - } - std::vector input_dict_ivalue = {input_dict}; - model_outputs_ = torch_model_->forward(input_dict_ivalue); - } else { - model_outputs_ = torch_model_->forward(*input_tensors); - } - - if (model_outputs_.isTuple()) { - auto model_outputs_tuple = model_outputs_.toTuple(); - size_t op_index = 0; - for (auto& m_op : model_outputs_tuple->elements()) { - if (m_op.isList()) { - auto list_output = m_op.toList(); - if (list_output.elementType()->kind() != c10::TypeKind::StringType) { - throw std::invalid_argument( - "output at index " + std::to_string(op_index) + - " must be of type Tensor or List[str], received List[" + - list_output.elementType()->str() + "]"); - } - output_tensors->push_back(m_op); - } else { - auto tensor_output = m_op.toTensor(); - output_tensors->push_back(m_op); - } - op_index++; - } - } else if (model_outputs_.isTensor()) { - output_tensors->push_back(model_outputs_); - } else if (model_outputs_.isList()) { - auto list_output = model_outputs_.toList(); - if (list_output.elementType()->kind() != c10::TypeKind::StringType) { - throw std::invalid_argument( - "output must be of type Tensor or List[str], received List[" + - list_output.elementType()->str() + "]"); - } - output_tensors->push_back(model_outputs_); - } else { - throw std::invalid_argument( - "output must be of type Tensor, List[str] or Tuple containing one of " - "these two types. 
It should not be a List / Dictionary of Tensors or " - "a Scalar"); - } - } - catch (std::exception& ex) { - SendErrorForResponses( - responses, response_count, - TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("PyTorch execute failure: " + std::string(ex.what())).c_str())); - } -} - -TRITONSERVER_Error* -ModelInstanceState::GetNamingConvention( - NamingConvention* naming_convention, - const std::vector& allowed_ios) -{ - // Rules for (non-Dictionary) input tensor names: - // 1. Must be in 'allowed_inputs' (arguments in the forward function) - // 2. Must follow the naming convention i.e. __ - // 3. If neither of the above conditions are satisfied, enforce strict - // ordering of model inputs. - // - // Rules for output tensor names: - // 1. Must follow the naming convention i.e. __ - // 2. If not, we enforce strict ordering of model outputs. - std::string deliminator = "__"; - std::string io_kind = "input"; - *naming_convention = NamingConvention::FORWARD_ARGUMENT; - - // symbolizes output - if (allowed_ios.size() == 0) { - io_kind = "output"; - *naming_convention = NamingConvention::NAMED_INDEX; - } - - triton::common::TritonJson::Value ios; - RETURN_IF_ERROR( - model_state_->ModelConfig().MemberAsArray(io_kind.c_str(), &ios)); - - if (io_kind == "input") { - for (size_t i = 0; i < ios.ArraySize(); i++) { - triton::common::TritonJson::Value io; - RETURN_IF_ERROR(ios.IndexAsObject(i, &io)); - - // Validate name - std::string io_name; - RETURN_IF_ERROR(io.MemberAsString("name", &io_name)); - auto itr = std::find(allowed_ios.begin(), allowed_ios.end(), io_name); - if (itr == allowed_ios.end()) { - *naming_convention = NamingConvention::NAMED_INDEX; - break; - } - } - } - - // If not, check if inputs follow INDEX - if (*naming_convention == NamingConvention::NAMED_INDEX) { - for (size_t i = 0; i < ios.ArraySize(); i++) { - triton::common::TritonJson::Value io; - RETURN_IF_ERROR(ios.IndexAsObject(i, &io)); - - // Validate name - std::string io_name; - RETURN_IF_ERROR(io.MemberAsString("name", &io_name)); - int start_pos = io_name.find(deliminator); - if (start_pos == -1) { - *naming_convention = NamingConvention::STRICT_CONFIG_ORDERING; - break; - } else { - // check if the index part of the name is not an integer - std::string index_str = io_name.substr(start_pos + 2); - bool is_int = true; - for (auto itr = index_str.begin(); itr != index_str.end(); itr++) { - if (std::isdigit(*itr) == 0) { - is_int = false; - } - } - - if (!is_int) { - if (io_kind == "input") { - LOG_MESSAGE( - TRITONSERVER_LOG_WARN, - ("input '" + io_name + - "' or previous input(s) are neither an input argument to the " - "model '" + - model_state_->Name() + - "' nor do they follow the __ naming convention. " - "Falling back to enforcing strict ordering from model " - "configuration.") - .c_str()); - } else { - LOG_MESSAGE( - TRITONSERVER_LOG_WARN, - ("output '" + io_name + - "' or previous output(s) of the model '" + - model_state_->Name() + - "' do not follow the __ naming convention. 
" - "Falling back to enforcing strict ordering from model " - "configuration.") - .c_str()); - } - *naming_convention = NamingConvention::STRICT_CONFIG_ORDERING; - break; - } - } - } - } - - triton::common::TritonJson::Value sequence_batching; - if (model_state_->ModelConfig().Find( - "sequence_batching", &sequence_batching)) { - // If we need to manage state for the model, then we need to check - // the naming of the state adheres to both the input and output conventions - triton::common::TritonJson::Value states; - if (sequence_batching.Find("state", &states)) { - if (*naming_convention != NamingConvention::NAMED_INDEX) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - ("PyTorch model '" + model_state_->Name() + - "' is using sequence batching with state but not all inputs and " - "outputs follow the __ naming convention. ") - .c_str()); - } - } - - for (size_t i = 0; i < states.ArraySize(); i++) { - triton::common::TritonJson::Value state; - RETURN_IF_ERROR(states.IndexAsObject(i, &state)); - std::string name_entry = - io_kind == "input" ? "input_name" : "output_name"; - std::string state_name; - RETURN_IF_ERROR(state.MemberAsString(name_entry.c_str(), &state_name)); - int start_pos = state_name.find(deliminator); - if (start_pos == -1) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - ("PyTorch model '" + model_state_->Name() + - "' is using sequence batching with state but state '" + - state_name + - "' does not follow the __ naming convention. ") - .c_str()); - } else { - // check if the index part of the name is not an integer - std::string index_str = state_name.substr(start_pos + 2); - bool is_int = true; - for (auto itr = index_str.begin(); itr != index_str.end(); itr++) { - if (std::isdigit(*itr) == 0) { - is_int = false; - } - } - if (!is_int) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - ("PyTorch model '" + model_state_->Name() + - "' is using sequence batching with state but state '" + - state_name + - "' does not follow the __ naming convention. ") - .c_str()); - } - } - } - } - - return nullptr; // success -} - -// This function will return a tensor's contents as a contiguous -// chunk in system memory. In some cases this will require copying the data. -// If that happens, 'contiguous_buffer' will be set to hold the contiguous -// chunk and 'cuda_copy' will be set to indicate whether CUDA copy is -// conducted. The data copy can be avoided if the input is already in -// a contiguous chunk and the input is located in memory type and id -// specified. 
-TRITONSERVER_Error* -GetContiguousInputContent( - TRITONBACKEND_Input* rinput, const uint32_t buffer_count, - const char** content, size_t* content_byte_size, - std::vector* contiguous_buffer, cudaStream_t stream, bool* cuda_copy) -{ - *cuda_copy = false; - - // Check input buffers to see if data copy is necessary - size_t chunk_count = 0; - bool type_mismatch = false; - uint64_t total_byte_size = 0; - for (size_t idx = 0; idx < buffer_count; ++idx) { - TRITONSERVER_MemoryType src_memory_type; - int64_t src_memory_type_id; - size_t src_byte_size; - const void* src_ptr; - - RETURN_IF_ERROR(TRITONBACKEND_InputBuffer( - rinput, idx, &src_ptr, &src_byte_size, &src_memory_type, - &src_memory_type_id)); - - if (src_ptr != nullptr) { - chunk_count++; - total_byte_size += src_byte_size; - type_mismatch |= (src_memory_type == TRITONSERVER_MEMORY_GPU); - } - } - - if (chunk_count == 0) { - *content = nullptr; - *content_byte_size = 0; - } else if ((chunk_count == 1) && !type_mismatch) { - TRITONSERVER_MemoryType src_memory_type; - int64_t src_memory_type_id; - RETURN_IF_ERROR(TRITONBACKEND_InputBuffer( - rinput, 0, (const void**)content, content_byte_size, &src_memory_type, - &src_memory_type_id)); - } else { - contiguous_buffer->resize(total_byte_size); - - size_t offset = 0; - for (size_t i = 0; i < chunk_count; i++) { - bool cuda_used; - TRITONSERVER_MemoryType src_memory_type; - int64_t src_memory_type_id; - size_t src_byte_size; - const void* src_ptr; - - RETURN_IF_ERROR(TRITONBACKEND_InputBuffer( - rinput, i, &src_ptr, &src_byte_size, &src_memory_type, - &src_memory_type_id)); - RETURN_IF_ERROR(CopyBuffer( - "Contiguous input", src_memory_type, src_memory_type_id, - TRITONSERVER_MEMORY_CPU, 0, src_byte_size, src_ptr, - contiguous_buffer->data() + offset, stream, &cuda_used)); - *cuda_copy |= cuda_used; - offset += src_byte_size; - } - - *content = contiguous_buffer->data(); - *content_byte_size = total_byte_size; - } - - return nullptr; // success -} - -void -FillStringTensor(torch::List* input_list, const size_t cnt) -{ - for (size_t c = 0; c < cnt; ++c) { - input_list->push_back(""); - } -} - -bool -SetStringInputTensor( - torch::List* input_list, TRITONBACKEND_Input* input, - const char* name, const uint32_t buffer_count, - const size_t request_element_cnt, TRITONBACKEND_Response** response, - cudaStream_t stream, const char* host_policy_name) -{ - bool cuda_copy = false; - - // For string data type, we always need to have the data on CPU so - // that we can read string length and construct the string - // properly. So if the request's input tensor is not in CPU need to - // copy it there. - const char* content = nullptr; - size_t content_byte_size = 0; - - std::vector contiguous_buffer; - auto err = GetContiguousInputContent( - input, buffer_count, &content, &content_byte_size, &contiguous_buffer, - stream, &cuda_copy); - if (err != nullptr) { - RESPOND_AND_SET_NULL_IF_ERROR(response, err); - FillStringTensor(input_list, request_element_cnt); - return cuda_copy; - } - -#ifdef TRITON_ENABLE_GPU - if (cuda_copy) { - cudaStreamSynchronize(stream); - cuda_copy = false; - } -#endif // TRITON_ENABLE_GPU - - std::vector> str_list; - err = ValidateStringBuffer( - content, content_byte_size, request_element_cnt, name, &str_list); - // Set string values. 
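// The loop below relies on Triton's wire format for BYTES/string tensors:
// each element is a 4-byte length prefix followed by the raw characters,
// with no terminator. A minimal, self-contained sketch of that parsing,
// independent of ValidateStringBuffer ('DecodeBytesTensor' is a hypothetical
// name used only for illustration, not part of this patch):

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <string>
#include <vector>

std::vector<std::string>
DecodeBytesTensor(const char* buf, size_t byte_size)
{
  std::vector<std::string> elements;
  size_t offset = 0;
  while (offset + sizeof(uint32_t) <= byte_size) {
    uint32_t len = 0;
    std::memcpy(&len, buf + offset, sizeof(uint32_t));
    offset += sizeof(uint32_t);
    if (offset + len > byte_size) {
      break;  // truncated element; the backend reports an error instead
    }
    elements.emplace_back(buf + offset, len);
    offset += len;
  }
  return elements;
}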
- for (const auto& [addr, len] : str_list) { - input_list->push_back(std::string(addr, len)); - } - - size_t element_cnt = str_list.size(); - if (err != nullptr) { - RESPOND_AND_SET_NULL_IF_ERROR(response, err); - FillStringTensor(input_list, request_element_cnt - element_cnt); - } - return cuda_copy; -} - -bool -SetStringBuffer( - torch::List* tensor, TRITONBACKEND_Response** response, - TRITONBACKEND_Output* response_output, TRITONBACKEND_State* response_state, - const size_t tensor_element_count, cudaStream_t stream, - std::string* serialized, bool state) -{ - bool cuda_copy = false; - - // Serialize the output tensor strings. Each string is serialized as - // a 4-byte length followed by the string itself with no - // null-terminator. - serialized->clear(); - for (size_t e = 0; e < tensor_element_count; ++e) { - std::string str = tensor->get(e).to(); - const char* cstr = str.c_str(); - size_t len = str.length(); - serialized->append(reinterpret_cast(&len), sizeof(uint32_t)); - if (len > 0) { - serialized->append(cstr, len); - } - } - - // Allocate a buffer large enough to hold the serialized tensor. - TRITONSERVER_MemoryType actual_memory_type = TRITONSERVER_MEMORY_CPU; - int64_t actual_memory_type_id = 0; - - TRITONSERVER_Error* err; - void* buffer; - - if (!state) { - auto err = TRITONBACKEND_OutputBuffer( - response_output, &buffer, serialized->size(), &actual_memory_type, - &actual_memory_type_id); - if (err != nullptr) { - RESPOND_AND_SET_NULL_IF_ERROR(response, err); - return cuda_copy; - } - } else { - auto err = TRITONBACKEND_StateBuffer( - response_state, &buffer, serialized->size(), &actual_memory_type, - &actual_memory_type_id); - if (err != nullptr) { - RESPOND_AND_SET_NULL_IF_ERROR(response, err); - return cuda_copy; - } - } - // Copy the serialized tensor into the allocated buffer. 
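// For reference, on a little-endian platform the serialization loop above
// turns the two-element list {"ab", "c"} into an 11-byte buffer:
//
//   02 00 00 00 'a' 'b'   01 00 00 00 'c'
//
// Only sizeof(uint32_t) bytes of the size_t length are appended, which
// matches the 4-byte format for element lengths below 4 GiB, and it is the
// same layout the input-side parsing sketched earlier expects.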
- bool cuda_used = false; - err = CopyBuffer( - "String output", TRITONSERVER_MEMORY_CPU /* src_memory_type */, - 0 /* src_memory_type_id */, actual_memory_type, actual_memory_type_id, - serialized->size(), reinterpret_cast(serialized->c_str()), - buffer, stream, &cuda_used); - cuda_copy |= cuda_used; - - if (err != nullptr) { - RESPOND_AND_SET_NULL_IF_ERROR(response, err); - return cuda_copy; - } - - if (state) { - RESPOND_AND_SET_NULL_IF_ERROR( - response, TRITONBACKEND_StateUpdate(response_state)); - } - - return cuda_copy; -} - - -bool -SetStringOutputBuffer( - torch::List* tensor, TRITONBACKEND_Response** response, - TRITONBACKEND_Output* response_output, const size_t tensor_element_count, - cudaStream_t stream, std::string* serialized) -{ - return SetStringBuffer( - tensor, response, response_output, nullptr /* response_state */, - tensor_element_count, stream, serialized, false /* state */); -} - -bool -SetStringStateBuffer( - torch::List* tensor, TRITONBACKEND_Response** response, - TRITONBACKEND_State* response_state, const size_t tensor_element_count, - cudaStream_t stream, std::string* serialized) -{ - return SetStringBuffer( - tensor, response, nullptr /* response_output */, response_state, - tensor_element_count, stream, serialized, true /* state */); -} - - -TRITONSERVER_Error* -ModelInstanceState::SetInputTensors( - size_t total_batch_size, TRITONBACKEND_Request** requests, - const uint32_t request_count, - std::vector* responses, - BackendInputCollector* collector, std::vector* input_names, - std::vector* input_tensors, bool* cuda_copy) -{ - // InferenceMode should be used to guard all tensors operations - torch::InferenceMode infer_guard(model_state_->EnabledInferenceMode()); - - // All requests must have equally-sized input tensors so use any - // request as the representative for the input tensors. - uint32_t input_count; - RETURN_IF_ERROR(TRITONBACKEND_RequestInputCount(requests[0], &input_count)); - - input_tensors->resize(input_count + batch_input_count_); - - // The inputs must be in contiguous CPU/GPU memory. - std::vector> alloc_perference; - if (device_.is_cpu()) { - alloc_perference = { - {TRITONSERVER_MEMORY_CPU_PINNED, 0}, {TRITONSERVER_MEMORY_CPU, 0}}; - } else { - alloc_perference = {{TRITONSERVER_MEMORY_GPU, device_.index()}}; - } - - for (uint32_t input_idx = 0; input_idx < input_count; input_idx++) { - TRITONBACKEND_Input* input; - RETURN_IF_ERROR( - TRITONBACKEND_RequestInputByIndex(requests[0], input_idx, &input)); - - const char* input_name; - TRITONSERVER_DataType input_datatype; - const int64_t* input_shape; - uint32_t input_dims_count; - RETURN_IF_ERROR(TRITONBACKEND_InputProperties( - input, &input_name, &input_datatype, &input_shape, &input_dims_count, - nullptr, nullptr)); - - input_names->emplace_back(input_name); - - // The shape for the entire input patch, - // [total_batch_size, ...] 
for non-ragged input and - // [total_element_count] for ragged input (non-nested tensor) - std::vector batchn_shape; - if (StateForModel()->IsInputRagged(input_name)) { - batchn_shape = std::vector{0}; - for (size_t idx = 0; idx < request_count; idx++) { - TRITONBACKEND_Input* input; - RESPOND_AND_SET_NULL_IF_ERROR( - &((*responses)[idx]), - TRITONBACKEND_RequestInput(requests[idx], input_name, &input)); - const int64_t* input_shape; - uint32_t input_dims_count; - RESPOND_AND_SET_NULL_IF_ERROR( - &((*responses)[idx]), TRITONBACKEND_InputProperties( - input, nullptr, nullptr, &input_shape, - &input_dims_count, nullptr, nullptr)); - - int64_t element_cnt = 0; - RESPOND_AND_SET_NULL_IF_ERROR( - &((*responses)[idx]), - GetElementCount(input_shape, input_dims_count, &element_cnt)); - batchn_shape[0] += element_cnt; - } - } else { - batchn_shape = - std::vector(input_shape, input_shape + input_dims_count); - if (supports_batching_) { - batchn_shape[0] = total_batch_size; - } - } - - // The input must be in contiguous CPU/GPU memory. - std::vector> alloc_perference; - // For 'KIND_MODEL', input will always be in CPU as we don't have a way to - // query the input types. - if (device_.is_cpu() || (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL)) { - alloc_perference = { - {TRITONSERVER_MEMORY_CPU_PINNED, 0}, {TRITONSERVER_MEMORY_CPU, 0}}; - } else { - alloc_perference = {{TRITONSERVER_MEMORY_GPU, device_.index()}}; - } - - const char* input_buffer; - size_t batchn_byte_size; - TRITONSERVER_MemoryType memory_type; - int64_t memory_type_id; - RETURN_IF_ERROR(collector->ProcessTensor( - input_name, nullptr, 0, alloc_perference, &input_buffer, - &batchn_byte_size, &memory_type, &memory_type_id)); - - // Create Torch tensor - const auto torch_dtype = ConvertDataTypeToTorchType(input_datatype); - torch::TensorOptions options{torch_dtype.second}; - auto updated_options = (memory_type == TRITONSERVER_MEMORY_GPU) - ? options.device(torch::kCUDA, device_.index()) - : options.device(torch::kCPU); - - if (input_datatype == TRITONSERVER_TYPE_BYTES) { - // Create the PyTorch list to hold the strings. 
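// The non-string branch of this datatype check (the else case further below)
// wraps the collected Triton buffer zero-copy with torch::from_blob, using
// the TensorOptions selected above. A standalone sketch of that pattern
// ('WrapTritonBuffer' is a hypothetical helper; the dtype is fixed to float
// for brevity, whereas the backend derives it from the request datatype):

#include <cstdint>
#include <vector>

#include <torch/torch.h>

torch::Tensor
WrapTritonBuffer(
    void* buffer, const std::vector<int64_t>& shape, bool on_gpu,
    int32_t gpu_id)
{
  auto options = torch::TensorOptions().dtype(torch::kFloat32);
  options = on_gpu ? options.device(torch::kCUDA, gpu_id)
                   : options.device(torch::kCPU);
  // The tensor aliases 'buffer'; the caller must keep the buffer alive for
  // the tensor's lifetime (the collector guarantees this in the backend).
  return torch::from_blob(buffer, shape, options);
}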
- torch::List input_list; - input_list.reserve(batchn_shape[0]); - - for (size_t idx = 0; idx < request_count; idx++) { - TRITONBACKEND_Input* input; - RESPOND_AND_SET_NULL_IF_ERROR( - &((*responses)[idx]), - TRITONBACKEND_RequestInput(requests[idx], input_name, &input)); - const int64_t* shape; - uint32_t dims_count; - uint32_t buffer_count; - RESPOND_AND_SET_NULL_IF_ERROR( - &((*responses)[idx]), - TRITONBACKEND_InputPropertiesForHostPolicy( - input, HostPolicyName().c_str(), nullptr, nullptr, &shape, - &dims_count, nullptr, &buffer_count)); - - int64_t batch_element_cnt = 0; - RESPOND_AND_SET_NULL_IF_ERROR( - &((*responses)[idx]), - GetElementCount(shape, dims_count, &batch_element_cnt)); - - *cuda_copy |= SetStringInputTensor( - &input_list, input, input_name, buffer_count, batch_element_cnt, - &((*responses)[idx]), GetCudaStreamByInstanceKind(), - HostPolicyName().c_str()); - } - - (*input_tensors)[input_index_map_[input_name]] = input_list; - } else { - if (batchn_byte_size) { - // Remove constness to align with the signature of torch::from_blob() - torch::Tensor input_tensor = torch::from_blob( - const_cast(input_buffer), batchn_shape, updated_options); - (*input_tensors)[input_index_map_[input_name]] = input_tensor; - } else { - // torch:from_blob seems not working when the input size is 0 - // create zero-length inputs directly - torch::Tensor input_tensor = - torch::zeros(batchn_shape, updated_options); - (*input_tensors)[input_index_map_[input_name]] = input_tensor; - } - } - } - - for (const auto& batch_input : StateForModel()->BatchInputs()) { - std::vector shape; - collector->BatchInputShape(batch_input, &shape); - - for (const auto& input_name : batch_input.TargetNames()) { - input_names->emplace_back(input_name.c_str()); - - const char* dst_buffer; - size_t dst_buffer_byte_size; - TRITONSERVER_MemoryType dst_memory_type; - int64_t dst_memory_type_id; - - RESPOND_ALL_AND_SET_NULL_IF_ERROR( - (*responses), responses->size(), - collector->ProcessBatchInput( - batch_input, nullptr, 0, alloc_perference, &dst_buffer, - &dst_buffer_byte_size, &dst_memory_type, &dst_memory_type_id)); - - const auto torch_dtype = - ConvertDataTypeToTorchType(batch_input.DataType()); - torch::TensorOptions options{torch_dtype.second}; - auto updated_options = (dst_memory_type == TRITONSERVER_MEMORY_GPU) - ? options.device(torch::kCUDA, device_.index()) - : options.device(torch::kCPU); - - if (dst_buffer_byte_size) { - torch::Tensor input_tensor = torch::from_blob( - const_cast(dst_buffer), shape, updated_options); - (*input_tensors)[input_index_map_[input_name]] = input_tensor; - } else { - // special handle when input has zero size - torch::Tensor input_tensor = torch::zeros(shape, updated_options); - (*input_tensors)[input_index_map_[input_name]] = input_tensor; - } - } - } - - // Finalize... 
- *cuda_copy |= collector->Finalize(); - - return nullptr; -} - -TRITONSERVER_Error* -ModelInstanceState::ReadOutputTensors( - size_t total_batch_size, - const std::vector& output_tensors, - TRITONBACKEND_Request** requests, const uint32_t request_count, - std::vector* responses) -{ - NVTX_RANGE(nvtx_, "ReadOutputTensors " + Name()); - - BackendOutputResponder responder( - requests, request_count, responses, model_state_->TritonMemoryManager(), - model_state_->MaxBatchSize() > 0, model_state_->EnablePinnedInput(), - GetCudaStreamByInstanceKind()); - - bool cuda_copy = false; - // The serialized string buffer must be valid until output copies are done - std::vector> string_buffer; - for (auto& output : model_state_->ModelOutputs()) { - int op_index = output_index_map_[output.first]; - auto name = output.first; - auto output_tensor_pair = output.second; - - if (output_tensors[op_index].isTensor()) { - torch::Tensor output_flat; - try { - output_flat = - output_tensors[op_index].toTensor().contiguous().flatten(); - } - catch (std::exception& ex) { - RETURN_IF_ERROR(TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("output tensor '") + name + "' is not found") - .c_str())); - } - - // Verify output datatype matches datatype from model config - TRITONSERVER_DataType output_dtype = - ConvertTorchTypeToDataType(output_flat.scalar_type()); - TRITONSERVER_DataType config_datatype = output_dtype_map_[name]; - if (config_datatype != output_dtype) { - RETURN_IF_ERROR(TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("configuration expects datatype TYPE_") + - TRITONSERVER_DataTypeString(config_datatype) + " for output '" + - name + "', model provides TYPE_" + - TRITONSERVER_DataTypeString(output_dtype)) - .c_str())); - } - - const char* output_buffer = - static_cast(output_flat.data_ptr()); - - // Output tensors may not reside on the same device as model - torch::Device tensor_device = output_flat.device(); - const auto memory_type = (tensor_device.type() == torch::kCPU) - ? TRITONSERVER_MEMORY_CPU - : TRITONSERVER_MEMORY_GPU; - const auto memory_id = - (tensor_device.type() == torch::kCPU) ? 0 : tensor_device.index(); - - // Batch output doesn't support string data type yet, as it is not trivial - // to parse string output - const BatchOutput* batch_output = StateForModel()->FindBatchOutput(name); - if (batch_output == nullptr) { - // Get output shape - std::vector batchn_shape; - auto shape = output_tensors[op_index].toTensor().sizes(); - for (auto itr = shape.begin(); itr != shape.end(); itr++) { - batchn_shape.push_back(*itr); - } - - if (batchn_shape.size() == 0) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("output '") + name + - "' is a scalar which is not supported.") - .c_str()); - } - if (output_tensor_pair.first != -1) { - responder.ProcessTensor( - name, output_dtype, batchn_shape, output_buffer, memory_type, - memory_id); - } - if (output_tensor_pair.second != -1) { - std::vector states; - states = responder.ProcessStateTensor( - name, output_dtype, batchn_shape, output_buffer, memory_type, - memory_id); - // Update the states - for (auto& state : states) { - RETURN_IF_ERROR(TRITONBACKEND_StateUpdate(state)); - } - } - - } else { - responder.ProcessBatchOutput( - name, *batch_output, output_buffer, memory_type, memory_id); - } - } else if (output_tensors[op_index].isList()) { - // Custom handling for string/bytes tensor... 
- torch::List output_list = - output_tensors[op_index].toList(); - - // Get output shape - std::vector batchn_shape{(int64_t)output_list.size()}; - - for (size_t idx = 0; idx < responses->size(); idx++) { - auto& request = requests[idx]; - auto& response = (*responses)[idx]; - - if (supports_batching_ != 0) { - TRITONBACKEND_Input* input; - TRITONBACKEND_RequestInputByIndex(request, 0 /* index*/, &input); - const int64_t* shape; - TRITONBACKEND_InputProperties( - input, nullptr, nullptr, &shape, nullptr, nullptr, nullptr); - batchn_shape[0] = shape[0]; - } - - int64_t tensor_element_cnt = 0; - RETURN_IF_ERROR(GetElementCount(batchn_shape, &tensor_element_cnt)); - - // Only need an response tensor for requested outputs. - if (response != nullptr) { - if (output_tensor_pair.first != -1) { - TRITONBACKEND_Output* response_output; - RESPOND_AND_SET_NULL_IF_ERROR( - &response, TRITONBACKEND_ResponseOutput( - response, &response_output, name.c_str(), - TRITONSERVER_TYPE_BYTES, batchn_shape.data(), - batchn_shape.size())); - string_buffer.emplace_back(new std::string()); - cuda_copy |= SetStringOutputBuffer( - &output_list, &response, response_output, tensor_element_cnt, - GetCudaStreamByInstanceKind(), string_buffer.back().get()); - } - } - if (output_tensor_pair.second != -1) { - TRITONBACKEND_State* response_state; - RESPOND_AND_SET_NULL_IF_ERROR( - &response, TRITONBACKEND_StateNew( - &response_state, request, name.c_str(), - TRITONSERVER_TYPE_BYTES, batchn_shape.data(), - batchn_shape.size())); - - string_buffer.emplace_back(new std::string()); - cuda_copy |= SetStringStateBuffer( - &output_list, &response, response_state, tensor_element_cnt, - GetCudaStreamByInstanceKind(), string_buffer.back().get()); - } - } - } else { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INVALID_ARG, - (std::string("output '") + name + - "' must be of type Tensor or List[str].") - .c_str()); - } - } - - // Finalize and wait for any pending buffer copies. - cuda_copy |= responder.Finalize(); - -#ifdef TRITON_ENABLE_GPU - // We have to always synchronize the stream. This is to make sure that - // the events on the cuda stream are synchronized. Otherwise, the events - // are only guaranteed to be synchronized if the model provides the output - // on GPU. - cudaStreamSynchronize(GetCudaStreamByInstanceKind()); -#endif - - return nullptr; -} - -TRITONSERVER_Error* -ModelInstanceState::RecordBackendTimestamp( - uint64_t* timestamp, void* cuda_event) -{ - if ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) || - ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && (device_cnt_ > 0))) { -#ifdef TRITON_ENABLE_GPU - cudaEvent_t* lcuda_event = reinterpret_cast(cuda_event); - RETURN_IF_ERROR(ConvertCUDAStatusToTritonError( - cudaEventRecord(*lcuda_event, GetCudaStreamByInstanceKind()), - TRITONSERVER_ERROR_INTERNAL, "Failed to record the event.")); -#endif - } else { - SET_TIMESTAMP(*timestamp); - } - return nullptr; -} - -void -ModelInstanceState::CreateCudaEvents(const int32_t& device_id) -{ -#ifdef TRITON_ENABLE_GPU - // Need to set the CUDA context so that the context that events are - // created on match with contexts that events are recorded with. 
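// CUDA events are tied to the device that is current when they are created,
// which is why cudaSetDevice() precedes each cudaEventCreate() below. A
// minimal illustration of the create/record/elapsed pattern this timing code
// builds on ('TimeStreamSection', 'device_id' and 'stream' are placeholders,
// not part of this patch; error handling omitted):

#include <cuda_runtime_api.h>

float
TimeStreamSection(int device_id, cudaStream_t stream)
{
  cudaSetDevice(device_id);
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);
  cudaEventRecord(start, stream);
  // ... enqueue work on 'stream' here ...
  cudaEventRecord(stop, stream);
  cudaEventSynchronize(stop);
  float elapsed_ms = 0.0f;
  cudaEventElapsedTime(&elapsed_ms, start, stop);  // milliseconds
  cudaEventDestroy(start);
  cudaEventDestroy(stop);
  return elapsed_ms;
}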
- THROW_IF_BACKEND_INSTANCE_ERROR(ConvertCUDAStatusToTritonError( - cudaSetDevice(device_id), TRITONSERVER_ERROR_INTERNAL, - "Failed to set the device")); - THROW_IF_BACKEND_INSTANCE_ERROR(ConvertCUDAStatusToTritonError( - cudaEventCreate(&compute_input_start_event_), TRITONSERVER_ERROR_INTERNAL, - "Failed to create cuda event")); - THROW_IF_BACKEND_INSTANCE_ERROR(ConvertCUDAStatusToTritonError( - cudaEventCreate(&compute_infer_start_event_), TRITONSERVER_ERROR_INTERNAL, - "Failed to create cuda event")); - THROW_IF_BACKEND_INSTANCE_ERROR(ConvertCUDAStatusToTritonError( - cudaEventCreate(&compute_output_start_event_), - TRITONSERVER_ERROR_INTERNAL, "Failed to create cuda event")); -#endif -} - -cudaStream_t -ModelInstanceState::GetCudaStreamByInstanceKind() -{ -#ifdef TRITON_ENABLE_GPU - if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) { - return stream_; - } else if ( - (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && - !stream_vec_.empty()) { - return stream_vec_[0]; - } -#endif - return nullptr; -} - -void -ModelInstanceState::SetCurrentCudaStream( - const cudaStream_t& stream, const int& device_id) -{ -#ifdef TRITON_ENABLE_GPU - at::cuda::CUDAStream torch_stream = - at::cuda::getStreamFromExternal(stream, device_id); - // This function replaces the default stream with the stream we created. It - // is not necessary to change the current device to the desired device when - // replacing the default stream for that device. See the documentation here: - // https://pytorch.org/cppdocs/api/function_namespacec10_1_1cuda_1a6ed50cc0fc16cc7014d9c2f4c3bd098d.html - at::cuda::setCurrentCUDAStream(torch_stream); -#endif -} - -float -ModelInstanceState::GetCudaEventElapsedTime( - const cudaEvent_t& start_event, const cudaEvent_t& end_event) -{ - float duration = 0; -#ifdef TRITON_ENABLE_GPU - // [FIXME] in the case of cudaEventElapsedTime failure, should handle - // stats reporting more gracefully as the durations are inaccurate - LOG_IF_ERROR( - ConvertCUDAStatusToTritonError( - cudaEventElapsedTime(&duration, start_event, end_event), - TRITONSERVER_ERROR_INTERNAL, "Failed to capture elapsed time"), - "Failed to capture elapsed time"); -#endif - return duration; -} - -///////////// +namespace triton::backend::pytorch { extern "C" { @@ -2704,8 +219,8 @@ TRITONBACKEND_ModelInstanceExecute( } return nullptr; // success -} +}; } // extern "C" -}}} // namespace triton::backend::pytorch +} // namespace triton::backend::pytorch diff --git a/src/libtorch.hh b/src/libtorch.hh new file mode 100644 index 0000000..263c340 --- /dev/null +++ b/src/libtorch.hh @@ -0,0 +1,59 @@ +// Copyright 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "model_instance_state.hh" +#include "model_state.hh" +#include "naming_convention.hh" +#include "string_utilities.hh" + +// +// PyTorch C++ (LibTorch) Backend that implements the TRITONBACKEND API. +// + +namespace triton::backend::pytorch { + +extern "C" { + +TRITONSERVER_Error* TRITONBACKEND_Initialize(TRITONBACKEND_Backend* backend); + +TRITONSERVER_Error* TRITONBACKEND_ModelInitialize(TRITONBACKEND_Model* model); + +TRITONSERVER_Error* TRITONBACKEND_ModelFinalize(TRITONBACKEND_Model* model); + +TRITONSERVER_Error* TRITONBACKEND_ModelInstanceInitialize( + TRITONBACKEND_ModelInstance* instance); + +TRITONSERVER_Error* TRITONBACKEND_ModelInstanceFinalize( + TRITONBACKEND_ModelInstance* instance); + +TRITONSERVER_Error* TRITONBACKEND_ModelInstanceExecute( + TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Request** requests, + const uint32_t request_count); + +} // extern "C" + + +} // namespace triton::backend::pytorch diff --git a/src/model_instance_state.cc b/src/model_instance_state.cc new file mode 100644 index 0000000..7cd5ee3 --- /dev/null +++ b/src/model_instance_state.cc @@ -0,0 +1,1632 @@ +// Copyright 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
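// The extern "C" functions declared above in libtorch.hh are the symbols the
// Triton core resolves from the backend shared library; their definitions
// remain in libtorch.cc. A generic sketch of the API-version handshake a
// TRITONBACKEND_Initialize implementation typically performs (this is not
// the backend's actual code; 'CheckBackendApiVersion' is a hypothetical
// helper):

#include "triton/core/tritonbackend.h"

TRITONSERVER_Error*
CheckBackendApiVersion()
{
  uint32_t api_major = 0, api_minor = 0;
  TRITONSERVER_Error* err = TRITONBACKEND_ApiVersion(&api_major, &api_minor);
  if (err != nullptr) {
    return err;
  }
  if ((api_major != TRITONBACKEND_API_VERSION_MAJOR) ||
      (api_minor < TRITONBACKEND_API_VERSION_MINOR)) {
    return TRITONSERVER_ErrorNew(
        TRITONSERVER_ERROR_UNSUPPORTED,
        "triton backend API version does not support this backend");
  }
  return nullptr;  // success
}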
+ +#include "model_instance_state.hh" +#include "string_utilities.hh" + +#ifdef TRITON_PYTORCH_ENABLE_TORCHVISION +// Suppress warnings in torch headers +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wsign-compare" +#pragma warning(push, 0) +#include +#include // Torchvision header +#pragma warning(pop) +#pragma GCC diagnostic pop +#endif // TRITON_PYTORCH_ENABLE_TORCHVISION + +#ifdef TRITON_ENABLE_GPU +#include +#include +#include +#endif // TRITON_ENABLE_GPU + + +namespace triton::backend::pytorch { + +ModelInstanceState::ModelInstanceState( + ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance) + : BackendModelInstance(model_state, triton_model_instance), + model_state_(model_state), device_(torch::kCPU), is_dict_input_(false), + device_cnt_(0) +{ + if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) { +#ifdef TRITON_ENABLE_GPU + device_ = torch::Device(torch::kCUDA, DeviceId()); + CreateCudaEvents(DeviceId()); +#endif + } + +#ifdef TRITON_ENABLE_GPU + device_cnt_ = torch::cuda::device_count(); +#endif + + THROW_IF_BACKEND_INSTANCE_ERROR(model_state->LoadModel( + ArtifactFilename(), device_, &model_path_, Kind(), &torch_model_)); + + if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) { +#ifdef TRITON_ENABLE_GPU + // Since we cannot determine the exact devices used by the model, we create + // a CUDA stream for every available device to ensure proper synchronization + // of CUDA streams. This approach may have implications when a timestamp is + // captured on a device that is not used by the model. Currently, this issue + // is addressed by synchronizing the CUDA streams before recording + // timestamps to prevent timestamp skewing. However, in the future, any + // modifications to the CUDA stream synchronization logic should be handled + // with caution. + for (int i = 0; i < device_cnt_; i++) { + cudaStream_t stream; + THROW_IF_BACKEND_INSTANCE_ERROR( + CreateCudaStream(i, 0 /* cuda_stream_priority */, &stream)); + stream_vec_.push_back(stream); + } + if (!stream_vec_.empty()) { + // Create CUDA events on the first device that will be used for collecting + // inputs/outputs. + CreateCudaEvents(0); + } +#endif + } + + size_t expected_input_cnt = 0; + { + triton::common::TritonJson::Value inputs; + if (model_state->ModelConfig().Find("input", &inputs)) { + expected_input_cnt = inputs.ArraySize(); + } + + triton::common::TritonJson::Value config_batch_inputs; + if (model_state->ModelConfig().Find("batch_input", &config_batch_inputs)) { + batch_input_count_ = config_batch_inputs.ArraySize(); + expected_input_cnt += batch_input_count_; + } + } + + // If this is a sequence model then make sure that the required + // inputs are present in the model and have the correct shape and + // datatype. 
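// Worked example of the count built up below: a model configuration that
// declares 2 regular inputs and uses sequence batching with the
// CONTROL_SEQUENCE_START, CONTROL_SEQUENCE_END, CONTROL_SEQUENCE_READY and
// CONTROL_SEQUENCE_CORRID controls plus one state tensor would be validated
// with expected_input_cnt = 2 + 4 (controls) + 1 (state) = 7 inputs.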
+ triton::common::TritonJson::Value sequence_batching; + if (model_state->ModelConfig().Find( + "sequence_batching", &sequence_batching)) { + bool have_start, have_end, have_ready, have_corrid; + THROW_IF_BACKEND_INSTANCE_ERROR(ValidateBooleanSequenceControl( + sequence_batching, "CONTROL_SEQUENCE_START", false /* required */, + &have_start)); + THROW_IF_BACKEND_INSTANCE_ERROR(ValidateBooleanSequenceControl( + sequence_batching, "CONTROL_SEQUENCE_END", false /* required */, + &have_end)); + THROW_IF_BACKEND_INSTANCE_ERROR(ValidateBooleanSequenceControl( + sequence_batching, "CONTROL_SEQUENCE_READY", false /* required */, + &have_ready)); + THROW_IF_BACKEND_INSTANCE_ERROR(ValidateTypedSequenceControl( + sequence_batching, "CONTROL_SEQUENCE_CORRID", false /* required */, + &have_corrid)); + if (have_start) { + expected_input_cnt += 1; + } + if (have_end) { + expected_input_cnt += 1; + } + if (have_ready) { + expected_input_cnt += 1; + } + if (have_corrid) { + expected_input_cnt += 1; + } + // Add the state inputs to the expected count + triton::common::TritonJson::Value states; + if (sequence_batching.Find("state", &states)) { + expected_input_cnt += states.ArraySize(); + } + } + supports_batching_ = model_state_->MaxBatchSize() > 0; + + THROW_IF_BACKEND_INSTANCE_ERROR(ValidateInputs(expected_input_cnt)); + THROW_IF_BACKEND_INSTANCE_ERROR(ValidateOutputs()); +} + +ModelInstanceState::~ModelInstanceState() +{ + torch_model_.reset(); + ClearCache(); + + if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) { +#ifdef TRITON_ENABLE_GPU + for (size_t i = 0; i < stream_vec_.size(); i++) { + LOG_IF_ERROR( + ConvertCUDAStatusToTritonError( + cudaSetDevice(i), TRITONSERVER_ERROR_INTERNAL, + "Failed to set the device"), + "Failed to set the device"); + + LOG_IF_ERROR( + ConvertCUDAStatusToTritonError( + cudaStreamDestroy(stream_vec_[i]), TRITONSERVER_ERROR_INTERNAL, + "Failed to destroy cuda stream"), + "~ModelInstanceState error: "); + stream_vec_[i] = nullptr; + } +#endif + } +} + +void +ModelInstanceState::AddInputToMap( + NamingConvention naming_convention, + const std::vector allowed_inputs, const std::string& io_name, + const uint32_t index) +{ + std::string deliminator = "__"; + + if (is_dict_input_) { + // If dictionary, index is irrelevant but we use the map to store the + // input names since they are the keys for the dictionary + input_index_map_[io_name] = index; + } else { + switch (naming_convention) { + case NamingConvention::FORWARD_ARGUMENT: { + auto itr = + std::find(allowed_inputs.begin(), allowed_inputs.end(), io_name); + if (itr != allowed_inputs.end()) { + input_index_map_[io_name] = + std::distance(allowed_inputs.begin(), itr); + } + return; + } + case NamingConvention::NAMED_INDEX: { + int start_pos = io_name.find(deliminator); + int ip_index = std::atoi(io_name.substr(start_pos + 2).c_str()); + input_index_map_[io_name] = ip_index; + return; + } + case NamingConvention::STRICT_CONFIG_ORDERING: { + input_index_map_[io_name] = index; + return; + } + } + } +} + +void +ModelInstanceState::ClearCache() +{ +#ifdef TRITON_ENABLE_GPU + if (device_.is_cuda() || + ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && (device_cnt_ > 0))) { + c10::cuda::CUDACachingAllocator::emptyCache(); + } +#endif // TRITON_ENABLE_GPU +} + +TRITONSERVER_Error* +ModelInstanceState::Create( + ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance, + ModelInstanceState** state) +{ + try { + *state = new ModelInstanceState(model_state, triton_model_instance); + } + catch (const 
BackendModelInstanceException& ex) { + RETURN_ERROR_IF_TRUE( + ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL, + std::string("unexpected nullptr in BackendModelInstanceException")); + RETURN_IF_ERROR(ex.err_); + } + + return nullptr; // success +} + +void +ModelInstanceState::Execute( + std::vector* responses, + const uint32_t response_count, + std::vector* input_tensors, + std::vector* output_tensors) +{ + NVTX_RANGE(nvtx_, "Execute " + Name()); + + torch::jit::IValue model_outputs_; + + try { + // enable/disable optimized execution + torch::jit::setGraphExecutorOptimize( + model_state_->EnabledOptimizedExecution()); + + // enable/disable inference mode - supersedes NoGradGuard + torch::InferenceMode infer_guard(model_state_->EnabledInferenceMode()); + + // enable/disable cudnn + at::globalContext().setUserEnabledCuDNN(model_state_->EnabledCudnn()); + + // JIT. No change is made unless parameter is explicitly set. + if (std::get<0>(model_state_->EnabledJitProfiling())) { + torch::jit::getProfilingMode() = + std::get<1>(model_state_->EnabledJitProfiling()); + } + + if (std::get<0>(model_state_->EnabledJitExecutor())) { + torch::jit::getExecutorMode() = + std::get<1>(model_state_->EnabledJitExecutor()); + } + + // Fuser. No change is made unless fuser is explicitly set in + // parameters. + if (std::get<0>(model_state_->EnabledTensorExprFuser())) { + torch::jit::setTensorExprFuserEnabled( + std::get<1>(model_state_->EnabledTensorExprFuser())); + } + + torch::NoGradGuard no_grad; + + // If input is a dictionary, prepare dictionary from 'input_tensors'. + if (is_dict_input_) { + torch::Dict input_dict; + for (auto& input_index : input_index_map_) { + torch::jit::IValue ival = (*input_tensors)[input_index.second]; + input_dict.insert(input_index.first, ival.toTensor()); + } + std::vector input_dict_ivalue = {input_dict}; + model_outputs_ = torch_model_->forward(input_dict_ivalue); + } else { + model_outputs_ = torch_model_->forward(*input_tensors); + } + + if (model_outputs_.isTuple()) { + auto model_outputs_tuple = model_outputs_.toTuple(); + size_t op_index = 0; + for (auto& m_op : model_outputs_tuple->elements()) { + if (m_op.isList()) { + auto list_output = m_op.toList(); + if (list_output.elementType()->kind() != c10::TypeKind::StringType) { + throw std::invalid_argument( + "output at index " + std::to_string(op_index) + + " must be of type Tensor or List[str], received List[" + + list_output.elementType()->str() + "]"); + } + output_tensors->push_back(m_op); + } else { + auto tensor_output = m_op.toTensor(); + output_tensors->push_back(m_op); + } + op_index++; + } + } else if (model_outputs_.isTensor()) { + output_tensors->push_back(model_outputs_); + } else if (model_outputs_.isList()) { + auto list_output = model_outputs_.toList(); + if (list_output.elementType()->kind() != c10::TypeKind::StringType) { + throw std::invalid_argument( + "output must be of type Tensor or List[str], received List[" + + list_output.elementType()->str() + "]"); + } + output_tensors->push_back(model_outputs_); + } else { + throw std::invalid_argument( + "output must be of type Tensor, List[str] or Tuple containing one of " + "these two types. 
It should not be a List / Dictionary of Tensors or " + "a Scalar"); + } + } + catch (std::exception& ex) { + SendErrorForResponses( + responses, response_count, + TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("PyTorch execute failure: " + std::string(ex.what())).c_str())); + } +} + +float +ModelInstanceState::GetCudaEventElapsedTime( + const cudaEvent_t& start_event, const cudaEvent_t& end_event) +{ + float duration = 0; +#ifdef TRITON_ENABLE_GPU + // [FIXME] in the case of cudaEventElapsedTime failure, should handle + // stats reporting more gracefully as the durations are inaccurate + LOG_IF_ERROR( + ConvertCUDAStatusToTritonError( + cudaEventElapsedTime(&duration, start_event, end_event), + TRITONSERVER_ERROR_INTERNAL, "Failed to capture elapsed time"), + "Failed to capture elapsed time"); +#endif + return duration; +} + + +cudaStream_t +ModelInstanceState::GetCudaStreamByInstanceKind() +{ +#ifdef TRITON_ENABLE_GPU + if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) { + return stream_; + } else if ( + (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && + !stream_vec_.empty()) { + return stream_vec_[0]; + } +#endif + return nullptr; +} + +TRITONSERVER_Error* +ModelInstanceState::GetNamingConvention( + NamingConvention* naming_convention, + const std::vector& allowed_ios) +{ + // Rules for (non-Dictionary) input tensor names: + // 1. Must be in 'allowed_inputs' (arguments in the forward function) + // 2. Must follow the naming convention i.e. __ + // 3. If neither of the above conditions are satisfied, enforce strict + // ordering of model inputs. + // + // Rules for output tensor names: + // 1. Must follow the naming convention i.e. __ + // 2. If not, we enforce strict ordering of model outputs. + std::string deliminator = "__"; + std::string io_kind = "input"; + *naming_convention = NamingConvention::FORWARD_ARGUMENT; + + // symbolizes output + if (allowed_ios.size() == 0) { + io_kind = "output"; + *naming_convention = NamingConvention::NAMED_INDEX; + } + + triton::common::TritonJson::Value ios; + RETURN_IF_ERROR( + model_state_->ModelConfig().MemberAsArray(io_kind.c_str(), &ios)); + + if (io_kind == "input") { + for (size_t i = 0; i < ios.ArraySize(); i++) { + triton::common::TritonJson::Value io; + RETURN_IF_ERROR(ios.IndexAsObject(i, &io)); + + // Validate name + std::string io_name; + RETURN_IF_ERROR(io.MemberAsString("name", &io_name)); + auto itr = std::find(allowed_ios.begin(), allowed_ios.end(), io_name); + if (itr == allowed_ios.end()) { + *naming_convention = NamingConvention::NAMED_INDEX; + break; + } + } + } + + // If not, check if inputs follow INDEX + if (*naming_convention == NamingConvention::NAMED_INDEX) { + for (size_t i = 0; i < ios.ArraySize(); i++) { + triton::common::TritonJson::Value io; + RETURN_IF_ERROR(ios.IndexAsObject(i, &io)); + + // Validate name + std::string io_name; + RETURN_IF_ERROR(io.MemberAsString("name", &io_name)); + int start_pos = io_name.find(deliminator); + if (start_pos == -1) { + *naming_convention = NamingConvention::STRICT_CONFIG_ORDERING; + break; + } else { + // check if the index part of the name is not an integer + std::string index_str = io_name.substr(start_pos + 2); + bool is_int = true; + for (auto itr = index_str.begin(); itr != index_str.end(); itr++) { + if (std::isdigit(*itr) == 0) { + is_int = false; + } + } + + if (!is_int) { + if (io_kind == "input") { + LOG_MESSAGE( + TRITONSERVER_LOG_WARN, + ("input '" + io_name + + "' or previous input(s) are neither an input argument to the " + "model '" + + 
model_state_->Name() + + "' nor do they follow the __ naming convention. " + "Falling back to enforcing strict ordering from model " + "configuration.") + .c_str()); + } else { + LOG_MESSAGE( + TRITONSERVER_LOG_WARN, + ("output '" + io_name + + "' or previous output(s) of the model '" + + model_state_->Name() + + "' do not follow the __ naming convention. " + "Falling back to enforcing strict ordering from model " + "configuration.") + .c_str()); + } + *naming_convention = NamingConvention::STRICT_CONFIG_ORDERING; + break; + } + } + } + } + + triton::common::TritonJson::Value sequence_batching; + if (model_state_->ModelConfig().Find( + "sequence_batching", &sequence_batching)) { + // If we need to manage state for the model, then we need to check + // the naming of the state adheres to both the input and output conventions + triton::common::TritonJson::Value states; + if (sequence_batching.Find("state", &states)) { + if (*naming_convention != NamingConvention::NAMED_INDEX) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + ("PyTorch model '" + model_state_->Name() + + "' is using sequence batching with state but not all inputs and " + "outputs follow the __ naming convention. ") + .c_str()); + } + } + + for (size_t i = 0; i < states.ArraySize(); i++) { + triton::common::TritonJson::Value state; + RETURN_IF_ERROR(states.IndexAsObject(i, &state)); + std::string name_entry = + io_kind == "input" ? "input_name" : "output_name"; + std::string state_name; + RETURN_IF_ERROR(state.MemberAsString(name_entry.c_str(), &state_name)); + int start_pos = state_name.find(deliminator); + if (start_pos == -1) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + ("PyTorch model '" + model_state_->Name() + + "' is using sequence batching with state but state '" + + state_name + + "' does not follow the __ naming convention. ") + .c_str()); + } else { + // check if the index part of the name is not an integer + std::string index_str = state_name.substr(start_pos + 2); + bool is_int = true; + for (auto itr = index_str.begin(); itr != index_str.end(); itr++) { + if (std::isdigit(*itr) == 0) { + is_int = false; + } + } + if (!is_int) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + ("PyTorch model '" + model_state_->Name() + + "' is using sequence batching with state but state '" + + state_name + + "' does not follow the __ naming convention. ") + .c_str()); + } + } + } + } + + return nullptr; // success +} + +void +ModelInstanceState::ProcessRequests( + TRITONBACKEND_Request** requests, const uint32_t request_count) +{ + LOG_MESSAGE( + TRITONSERVER_LOG_VERBOSE, + (std::string("TRITONBACKEND_ModelExecute: Running ") + Name() + " with " + + std::to_string(request_count) + " requests") + .c_str()); + +#ifdef TRITON_ENABLE_GPU + if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) { + SetCurrentCudaStream(stream_, DeviceId()); + } else if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) { + // Replace the default stream of each device with the one we created. + for (size_t i = 0; i < stream_vec_.size(); i++) { + SetCurrentCudaStream(stream_vec_[i], i); + } + } +#endif + + NVTX_RANGE(nvtx_, "ProcessRequests " + Name()); + + uint64_t exec_start_ns = 0; + SET_TIMESTAMP(exec_start_ns); + + const int max_batch_size = model_state_->MaxBatchSize(); + + // For each request collect the total batch size for this inference + // execution. The batch-size, number of inputs, and size of each + // input has already been checked so don't need to do that here. 
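// Worked example of the accumulation below: with max_batch_size = 8 and
// three batched requests whose first input has shape [4, ...], [1, ...] and
// [3, ...], total_batch_size becomes 4 + 1 + 3 = 8, which still passes the
// max-batch-size check further down. A fourth request would push the total
// above 8 and fail all responses with TRITONSERVER_ERROR_INTERNAL.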
+ size_t total_batch_size = 0; + for (size_t i = 0; i < request_count; i++) { + // If we get a nullptr request then something is badly wrong. Fail + // and release all requests. + if (requests[i] == nullptr) { + RequestsRespondWithError( + requests, request_count, + TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + std::string( + "null request given to PyTorch backend for '" + Name() + "'") + .c_str())); + return; + } + } + + // At this point we are committed to running inference with all + // 'requests'. Create a response for each request. During input + // processing if there is an error with any request that error will + // be sent immediately with the corresponding response (and the + // response unique_ptr will then be nullptr). The request object + // itself will not be released until after all inferencing is done + // (below) as we may need to access the request object when + // determine how to process outputs (for example, even if we don't + // need the outputs for a request that has an error, we do need to + // know the size of those outputs associated with the request so we + // can skip them in the output tensors). + std::vector responses; + responses.reserve(request_count); + bool all_response_failed = false; + + for (size_t i = 0; i < request_count; i++) { + TRITONBACKEND_Response* response; + auto err = TRITONBACKEND_ResponseNew(&response, requests[i]); + if (err == nullptr) { + responses.emplace_back(response); + } else { + responses.emplace_back(nullptr); + LOG_MESSAGE(TRITONSERVER_LOG_ERROR, "Fail to create response"); + TRITONSERVER_ErrorDelete(err); + } + } + + for (size_t i = 0; i < request_count; i++) { + if (max_batch_size > 0) { + // Retrieve the batch size from one of the inputs, if the model + // supports batching, the first dimension size is batch size. + TRITONBACKEND_Input* input; + TRITONSERVER_Error* err = + TRITONBACKEND_RequestInputByIndex(requests[i], 0 /* index */, &input); + if (err == nullptr) { + const int64_t* shape; + err = TRITONBACKEND_InputProperties( + input, nullptr, nullptr, &shape, nullptr, nullptr, nullptr); + total_batch_size += shape[0]; + } + if (err != nullptr) { + RESPOND_ALL_AND_SET_TRUE_IF_ERROR( + responses, request_count, all_response_failed, err); + } + } else { + total_batch_size += 1; + } + } + + // If there are no valid payloads then no need to run the inference. + if (total_batch_size == 0) { + return; + } + + // Make sure the maximum batch size is not exceeded. The + // total_batch_size must be 1 for models that don't support batching + // (i.e. max_batch_size == 0). If max_batch_size is exceeded then + // scheduler has done something badly wrong so fail and release all + // requests. + if (!all_response_failed) { + if ((total_batch_size != 1) && + (total_batch_size > (size_t)max_batch_size)) { + RESPOND_ALL_AND_SET_TRUE_IF_ERROR( + responses, request_count, all_response_failed, + TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + std::string( + "batch size " + std::to_string(total_batch_size) + " for '" + + Name() + "', max allowed is " + + std::to_string(max_batch_size)) + .c_str())); + } + } + + std::vector input_names; + std::vector input_tensors; + bool cuda_copy = false; + std::unique_ptr collector; + + // For 'KIND_MODEL', it's fine to use CUDA events to calculate the compute + // input duration since only one stream will be used for input collection. 
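// Timeline of the CUDA events recorded on the single collection stream; on
// the KIND_GPU path all three are used, while KIND_MODEL combines the first
// two with host timestamps for the infer duration:
//
//   compute_input_start_event_   - recorded here, before input collection
//   compute_infer_start_event_   - recorded just before Execute()
//   compute_output_start_event_  - recorded just before ReadOutputTensors()
//
// elapsed(input_start, infer_start)  -> compute-input duration
// elapsed(infer_start, output_start) -> compute-infer duration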
+ if ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) || + ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && (device_cnt_ > 0))) { +#ifdef TRITON_ENABLE_GPU + RESPOND_ALL_AND_SET_TRUE_IF_ERROR( + responses, request_count, all_response_failed, + ConvertCUDAStatusToTritonError( + cudaEventRecord( + compute_input_start_event_, GetCudaStreamByInstanceKind()), + TRITONSERVER_ERROR_INTERNAL, "Failed to record the event.")); +#endif + } + + if (!all_response_failed) { + collector.reset(new BackendInputCollector( + requests, request_count, &responses, + model_state_->TritonMemoryManager(), model_state_->EnablePinnedInput(), + GetCudaStreamByInstanceKind(), nullptr, nullptr, 0, + HostPolicyName().c_str())); + RESPOND_ALL_AND_SET_TRUE_IF_ERROR( + responses, request_count, all_response_failed, + SetInputTensors( + total_batch_size, requests, request_count, &responses, + collector.get(), &input_names, &input_tensors, &cuda_copy)); + } + +#ifdef TRITON_ENABLE_GPU + if (cuda_copy) { + cudaStreamSynchronize(GetCudaStreamByInstanceKind()); + cuda_copy = false; + } +#endif + + std::vector output_tensors; + uint64_t compute_start_ns = 0; + uint64_t compute_infer_start = 0; + + RESPOND_ALL_AND_SET_TRUE_IF_ERROR( + responses, request_count, all_response_failed, + RecordBackendTimestamp( + &compute_start_ns, + reinterpret_cast(&compute_infer_start_event_))); + + // For 'KIND_MODEL', capture the timestamp for the compute infer duration. + if ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && (device_cnt_ > 0)) { + SET_TIMESTAMP(compute_infer_start); + } + + // Run... + if (!all_response_failed) { + Execute(&responses, request_count, &input_tensors, &output_tensors); + } + + // Verify output indices are valid with number of outputs after execution + bool invalid_index = false; + int max_index = output_tensors.size() - 1; + + if (!all_response_failed) { + for (const auto& name : model_state_->ModelOutputs()) { + int op_index = output_index_map_[name.first]; + if ((op_index < 0) || (op_index > max_index)) { + RESPOND_ALL_AND_SET_TRUE_IF_ERROR( + responses, request_count, all_response_failed, + TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + std::string( + "The output " + std::string(name.first) + + " in the model configuration refers to an output index " + "which doesn't exist. This model has " + + std::to_string(max_index + 1) + " outputs") + .c_str())); + invalid_index = true; + break; + } + } + } + +#ifdef TRITON_ENABLE_GPU + if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) { + // For 'KIND_MODEL', multiple streams will be involved, so we need to call + // 'cudaStreamSynchronize' before reading the output tensors. 
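// Worked example of the index check above: if the model configuration
// declares outputs OUTPUT__0 and OUTPUT__2 but the TorchScript forward()
// returns a 2-tuple, then output_tensors holds indices 0..1, op_index 2 for
// OUTPUT__2 exceeds max_index = 1, and every response fails with
// TRITONSERVER_ERROR_INVALID_ARG ("This model has 2 outputs").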
+ for (auto& stream : stream_vec_) { + cudaStreamSynchronize(stream); + } + } +#endif + + uint64_t compute_end_ns = 0; + uint64_t compute_output_start = 0; + + if ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && (device_cnt_ > 0)) { +#ifdef TRITON_ENABLE_GPU + SET_TIMESTAMP(compute_output_start); +#endif + } else { + RESPOND_ALL_AND_SET_TRUE_IF_ERROR( + responses, request_count, all_response_failed, + RecordBackendTimestamp( + &compute_end_ns, + reinterpret_cast(&compute_output_start_event_))); + } + + if (!all_response_failed) { + if (!invalid_index) { + RESPOND_ALL_AND_SET_TRUE_IF_ERROR( + responses, request_count, all_response_failed, + ReadOutputTensors( + total_batch_size, output_tensors, requests, request_count, + &responses)); + } + } + + uint64_t exec_end_ns = 0; + SET_TIMESTAMP(exec_end_ns); + + // Send all the responses that haven't already been sent because of + // an earlier error. Note that the responses are not set to nullptr + // here as we need that indication below to determine if the request + // we successful or not. + for (auto& response : responses) { + if (response != nullptr) { + LOG_IF_ERROR( + TRITONBACKEND_ResponseSend( + response, TRITONSERVER_RESPONSE_COMPLETE_FINAL, nullptr), + "failed to send PyTorch backend response"); + } + } + + // We don't need an explicit CUDA syncrhonization here since we have already + // synchronized the stream in the ReadOutputTensors function. + if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) { +#ifdef TRITON_ENABLE_GPU + float compute_input_duration = GetCudaEventElapsedTime( + compute_input_start_event_, compute_infer_start_event_); + float compute_infer_duration = GetCudaEventElapsedTime( + compute_infer_start_event_, compute_output_start_event_); + + compute_start_ns = exec_start_ns + (compute_input_duration * 1e6); + compute_end_ns = compute_start_ns + (compute_infer_duration * 1e6); +#endif + } else if ( + (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && (device_cnt_ > 0)) { +#ifdef TRITON_ENABLE_GPU + float compute_input_duration = GetCudaEventElapsedTime( + compute_input_start_event_, compute_infer_start_event_); + uint64_t compute_infer_duration = + compute_output_start - compute_infer_start; + + compute_start_ns = exec_start_ns + (compute_input_duration * 1e6); + compute_end_ns = compute_start_ns + compute_infer_duration; +#endif + } + + // Report statistics for each request. + for (uint32_t r = 0; r < request_count; ++r) { + auto& request = requests[r]; + LOG_IF_ERROR( + TRITONBACKEND_ModelInstanceReportStatistics( + TritonModelInstance(), request, + (responses[r] != nullptr) /* success */, exec_start_ns, + compute_start_ns, compute_end_ns, exec_end_ns), + "failed reporting request statistics"); + + LOG_IF_ERROR( + TRITONBACKEND_RequestRelease(request, TRITONSERVER_REQUEST_RELEASE_ALL), + "failed releasing request"); + } + + if (!all_response_failed) { + // Report the entire batch statistics. 
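// Worked example of the duration math above: cudaEventElapsedTime() reports
// milliseconds while the statistics API expects nanoseconds, hence the
// multiplication by 1e6. With exec_start_ns = 1'000'000'000, a measured
// compute-input time of 0.8 ms and a compute-infer time of 4.2 ms:
//
//   compute_start_ns = 1'000'000'000 + 0.8 * 1e6 = 1'000'800'000
//   compute_end_ns   = 1'000'800'000 + 4.2 * 1e6 = 1'005'000'000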
+ LOG_IF_ERROR( + TRITONBACKEND_ModelInstanceReportBatchStatistics( + TritonModelInstance(), total_batch_size, exec_start_ns, + compute_start_ns, compute_end_ns, exec_end_ns), + "failed reporting batch request statistics"); + } +} + +TRITONSERVER_Error* +ModelInstanceState::ReadOutputTensors( + size_t total_batch_size, + const std::vector& output_tensors, + TRITONBACKEND_Request** requests, const uint32_t request_count, + std::vector* responses) +{ + NVTX_RANGE(nvtx_, "ReadOutputTensors " + Name()); + + BackendOutputResponder responder( + requests, request_count, responses, model_state_->TritonMemoryManager(), + model_state_->MaxBatchSize() > 0, model_state_->EnablePinnedInput(), + GetCudaStreamByInstanceKind()); + + bool cuda_copy = false; + // The serialized string buffer must be valid until output copies are done + std::vector> string_buffer; + for (auto& output : model_state_->ModelOutputs()) { + int op_index = output_index_map_[output.first]; + auto name = output.first; + auto output_tensor_pair = output.second; + + if (output_tensors[op_index].isTensor()) { + torch::Tensor output_flat; + try { + output_flat = + output_tensors[op_index].toTensor().contiguous().flatten(); + } + catch (std::exception& ex) { + RETURN_IF_ERROR(TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + (std::string("output tensor '") + name + "' is not found") + .c_str())); + } + + // Verify output datatype matches datatype from model config + TRITONSERVER_DataType output_dtype = + ConvertTorchTypeToDataType(output_flat.scalar_type()); + TRITONSERVER_DataType config_datatype = output_dtype_map_[name]; + if (config_datatype != output_dtype) { + RETURN_IF_ERROR(TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + (std::string("configuration expects datatype TYPE_") + + TRITONSERVER_DataTypeString(config_datatype) + " for output '" + + name + "', model provides TYPE_" + + TRITONSERVER_DataTypeString(output_dtype)) + .c_str())); + } + + const char* output_buffer = + static_cast(output_flat.data_ptr()); + + // Output tensors may not reside on the same device as model + torch::Device tensor_device = output_flat.device(); + const auto memory_type = (tensor_device.type() == torch::kCPU) + ? TRITONSERVER_MEMORY_CPU + : TRITONSERVER_MEMORY_GPU; + const auto memory_id = + (tensor_device.type() == torch::kCPU) ? 
0 : tensor_device.index(); + + // Batch output doesn't support string data type yet, as it is not trivial + // to parse string output + const BatchOutput* batch_output = StateForModel()->FindBatchOutput(name); + if (batch_output == nullptr) { + // Get output shape + std::vector batchn_shape; + auto shape = output_tensors[op_index].toTensor().sizes(); + for (auto itr = shape.begin(); itr != shape.end(); itr++) { + batchn_shape.push_back(*itr); + } + + if (batchn_shape.size() == 0) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + (std::string("output '") + name + + "' is a scalar which is not supported.") + .c_str()); + } + if (output_tensor_pair.first != -1) { + responder.ProcessTensor( + name, output_dtype, batchn_shape, output_buffer, memory_type, + memory_id); + } + if (output_tensor_pair.second != -1) { + std::vector states; + states = responder.ProcessStateTensor( + name, output_dtype, batchn_shape, output_buffer, memory_type, + memory_id); + // Update the states + for (auto& state : states) { + RETURN_IF_ERROR(TRITONBACKEND_StateUpdate(state)); + } + } + + } else { + responder.ProcessBatchOutput( + name, *batch_output, output_buffer, memory_type, memory_id); + } + } else if (output_tensors[op_index].isList()) { + // Custom handling for string/bytes tensor... + torch::List output_list = + output_tensors[op_index].toList(); + + // Get output shape + std::vector batchn_shape{(int64_t)output_list.size()}; + + for (size_t idx = 0; idx < responses->size(); idx++) { + auto& request = requests[idx]; + auto& response = (*responses)[idx]; + + if (supports_batching_ != 0) { + TRITONBACKEND_Input* input; + TRITONBACKEND_RequestInputByIndex(request, 0 /* index*/, &input); + const int64_t* shape; + TRITONBACKEND_InputProperties( + input, nullptr, nullptr, &shape, nullptr, nullptr, nullptr); + batchn_shape[0] = shape[0]; + } + + int64_t tensor_element_cnt = 0; + RETURN_IF_ERROR(GetElementCount(batchn_shape, &tensor_element_cnt)); + + // Only need an response tensor for requested outputs. + if (response != nullptr) { + if (output_tensor_pair.first != -1) { + TRITONBACKEND_Output* response_output; + RESPOND_AND_SET_NULL_IF_ERROR( + &response, TRITONBACKEND_ResponseOutput( + response, &response_output, name.c_str(), + TRITONSERVER_TYPE_BYTES, batchn_shape.data(), + batchn_shape.size())); + string_buffer.emplace_back(new std::string()); + cuda_copy |= SetStringOutputBuffer( + &output_list, &response, response_output, tensor_element_cnt, + GetCudaStreamByInstanceKind(), string_buffer.back().get()); + } + } + if (output_tensor_pair.second != -1) { + TRITONBACKEND_State* response_state; + RESPOND_AND_SET_NULL_IF_ERROR( + &response, TRITONBACKEND_StateNew( + &response_state, request, name.c_str(), + TRITONSERVER_TYPE_BYTES, batchn_shape.data(), + batchn_shape.size())); + + string_buffer.emplace_back(new std::string()); + cuda_copy |= SetStringStateBuffer( + &output_list, &response, response_state, tensor_element_cnt, + GetCudaStreamByInstanceKind(), string_buffer.back().get()); + } + } + } else { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + (std::string("output '") + name + + "' must be of type Tensor or List[str].") + .c_str()); + } + } + + // Finalize and wait for any pending buffer copies. + cuda_copy |= responder.Finalize(); + +#ifdef TRITON_ENABLE_GPU + // We have to always synchronize the stream. This is to make sure that + // the events on the cuda stream are synchronized. 
Otherwise, the events + // are only guaranteed to be synchronized if the model provides the output + // on GPU. + cudaStreamSynchronize(GetCudaStreamByInstanceKind()); +#endif + + return nullptr; +} + +TRITONSERVER_Error* +ModelInstanceState::RecordBackendTimestamp( + uint64_t* timestamp, void* cuda_event) +{ + if ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) || + ((Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL) && (device_cnt_ > 0))) { +#ifdef TRITON_ENABLE_GPU + cudaEvent_t* lcuda_event = reinterpret_cast(cuda_event); + RETURN_IF_ERROR(ConvertCUDAStatusToTritonError( + cudaEventRecord(*lcuda_event, GetCudaStreamByInstanceKind()), + TRITONSERVER_ERROR_INTERNAL, "Failed to record the event.")); +#endif + } else { + SET_TIMESTAMP(*timestamp); + } + return nullptr; +} + +void +ModelInstanceState::SetCurrentCudaStream( + const cudaStream_t& stream, const int& device_id) +{ +#ifdef TRITON_ENABLE_GPU + at::cuda::CUDAStream torch_stream = + at::cuda::getStreamFromExternal(stream, device_id); + // This function replaces the default stream with the stream we created. It + // is not necessary to change the current device to the desired device when + // replacing the default stream for that device. See the documentation here: + // https://pytorch.org/cppdocs/api/function_namespacec10_1_1cuda_1a6ed50cc0fc16cc7014d9c2f4c3bd098d.html + at::cuda::setCurrentCUDAStream(torch_stream); +#endif +} + +TRITONSERVER_Error* +ModelInstanceState::SetInputTensors( + size_t total_batch_size, TRITONBACKEND_Request** requests, + const uint32_t request_count, + std::vector* responses, + BackendInputCollector* collector, std::vector* input_names, + std::vector* input_tensors, bool* cuda_copy) +{ + // InferenceMode should be used to guard all tensors operations + torch::InferenceMode infer_guard(model_state_->EnabledInferenceMode()); + + // All requests must have equally-sized input tensors so use any + // request as the representative for the input tensors. + uint32_t input_count; + RETURN_IF_ERROR(TRITONBACKEND_RequestInputCount(requests[0], &input_count)); + + input_tensors->resize(input_count + batch_input_count_); + + // The inputs must be in contiguous CPU/GPU memory. + std::vector> alloc_perference; + if (device_.is_cpu()) { + alloc_perference = { + {TRITONSERVER_MEMORY_CPU_PINNED, 0}, {TRITONSERVER_MEMORY_CPU, 0}}; + } else { + alloc_perference = {{TRITONSERVER_MEMORY_GPU, device_.index()}}; + } + + for (uint32_t input_idx = 0; input_idx < input_count; input_idx++) { + TRITONBACKEND_Input* input; + RETURN_IF_ERROR( + TRITONBACKEND_RequestInputByIndex(requests[0], input_idx, &input)); + + const char* input_name; + TRITONSERVER_DataType input_datatype; + const int64_t* input_shape; + uint32_t input_dims_count; + RETURN_IF_ERROR(TRITONBACKEND_InputProperties( + input, &input_name, &input_datatype, &input_shape, &input_dims_count, + nullptr, nullptr)); + + input_names->emplace_back(input_name); + + // The shape for the entire input patch, + // [total_batch_size, ...] 
for non-ragged input and + // [total_element_count] for ragged input (non-nested tensor) + std::vector batchn_shape; + if (StateForModel()->IsInputRagged(input_name)) { + batchn_shape = std::vector{0}; + for (size_t idx = 0; idx < request_count; idx++) { + TRITONBACKEND_Input* input; + RESPOND_AND_SET_NULL_IF_ERROR( + &((*responses)[idx]), + TRITONBACKEND_RequestInput(requests[idx], input_name, &input)); + const int64_t* input_shape; + uint32_t input_dims_count; + RESPOND_AND_SET_NULL_IF_ERROR( + &((*responses)[idx]), TRITONBACKEND_InputProperties( + input, nullptr, nullptr, &input_shape, + &input_dims_count, nullptr, nullptr)); + + int64_t element_cnt = 0; + RESPOND_AND_SET_NULL_IF_ERROR( + &((*responses)[idx]), + GetElementCount(input_shape, input_dims_count, &element_cnt)); + batchn_shape[0] += element_cnt; + } + } else { + batchn_shape = + std::vector(input_shape, input_shape + input_dims_count); + if (supports_batching_) { + batchn_shape[0] = total_batch_size; + } + } + + // The input must be in contiguous CPU/GPU memory. + std::vector> alloc_perference; + // For 'KIND_MODEL', input will always be in CPU as we don't have a way to + // query the input types. + if (device_.is_cpu() || (Kind() == TRITONSERVER_INSTANCEGROUPKIND_MODEL)) { + alloc_perference = { + {TRITONSERVER_MEMORY_CPU_PINNED, 0}, {TRITONSERVER_MEMORY_CPU, 0}}; + } else { + alloc_perference = {{TRITONSERVER_MEMORY_GPU, device_.index()}}; + } + + const char* input_buffer; + size_t batchn_byte_size; + TRITONSERVER_MemoryType memory_type; + int64_t memory_type_id; + RETURN_IF_ERROR(collector->ProcessTensor( + input_name, nullptr, 0, alloc_perference, &input_buffer, + &batchn_byte_size, &memory_type, &memory_type_id)); + + // Create Torch tensor + const auto torch_dtype = ConvertDataTypeToTorchType(input_datatype); + torch::TensorOptions options{torch_dtype.second}; + auto updated_options = (memory_type == TRITONSERVER_MEMORY_GPU) + ? options.device(torch::kCUDA, device_.index()) + : options.device(torch::kCPU); + + if (input_datatype == TRITONSERVER_TYPE_BYTES) { + // Create the PyTorch list to hold the strings. 
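+      // TYPE_STRING inputs cannot be represented as a torch::Tensor, so they
+      // are passed to the model as a torch::List of std::string, one entry
+      // per tensor element. Each request's serialized (4-byte length-prefixed)
+      // byte buffer is deserialized and appended to the list below.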
+ torch::List input_list; + input_list.reserve(batchn_shape[0]); + + for (size_t idx = 0; idx < request_count; idx++) { + TRITONBACKEND_Input* input; + RESPOND_AND_SET_NULL_IF_ERROR( + &((*responses)[idx]), + TRITONBACKEND_RequestInput(requests[idx], input_name, &input)); + const int64_t* shape; + uint32_t dims_count; + uint32_t buffer_count; + RESPOND_AND_SET_NULL_IF_ERROR( + &((*responses)[idx]), + TRITONBACKEND_InputPropertiesForHostPolicy( + input, HostPolicyName().c_str(), nullptr, nullptr, &shape, + &dims_count, nullptr, &buffer_count)); + + int64_t batch_element_cnt = 0; + RESPOND_AND_SET_NULL_IF_ERROR( + &((*responses)[idx]), + GetElementCount(shape, dims_count, &batch_element_cnt)); + + *cuda_copy |= SetStringInputTensor( + &input_list, input, input_name, buffer_count, batch_element_cnt, + &((*responses)[idx]), GetCudaStreamByInstanceKind(), + HostPolicyName().c_str()); + } + + (*input_tensors)[input_index_map_[input_name]] = input_list; + } else { + if (batchn_byte_size) { + // Remove constness to align with the signature of torch::from_blob() + torch::Tensor input_tensor = torch::from_blob( + const_cast(input_buffer), batchn_shape, updated_options); + (*input_tensors)[input_index_map_[input_name]] = input_tensor; + } else { + // torch:from_blob seems not working when the input size is 0 + // create zero-length inputs directly + torch::Tensor input_tensor = + torch::zeros(batchn_shape, updated_options); + (*input_tensors)[input_index_map_[input_name]] = input_tensor; + } + } + } + + for (const auto& batch_input : StateForModel()->BatchInputs()) { + std::vector shape; + collector->BatchInputShape(batch_input, &shape); + + for (const auto& input_name : batch_input.TargetNames()) { + input_names->emplace_back(input_name.c_str()); + + const char* dst_buffer; + size_t dst_buffer_byte_size; + TRITONSERVER_MemoryType dst_memory_type; + int64_t dst_memory_type_id; + + RESPOND_ALL_AND_SET_NULL_IF_ERROR( + (*responses), responses->size(), + collector->ProcessBatchInput( + batch_input, nullptr, 0, alloc_perference, &dst_buffer, + &dst_buffer_byte_size, &dst_memory_type, &dst_memory_type_id)); + + const auto torch_dtype = + ConvertDataTypeToTorchType(batch_input.DataType()); + torch::TensorOptions options{torch_dtype.second}; + auto updated_options = (dst_memory_type == TRITONSERVER_MEMORY_GPU) + ? options.device(torch::kCUDA, device_.index()) + : options.device(torch::kCPU); + + if (dst_buffer_byte_size) { + torch::Tensor input_tensor = torch::from_blob( + const_cast(dst_buffer), shape, updated_options); + (*input_tensors)[input_index_map_[input_name]] = input_tensor; + } else { + // special handle when input has zero size + torch::Tensor input_tensor = torch::zeros(shape, updated_options); + (*input_tensors)[input_index_map_[input_name]] = input_tensor; + } + } + } + + // Finalize... 
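+  // collector->Finalize() flushes any copies the collector deferred (for
+  // example when staging through pinned memory) and reports whether a CUDA
+  // copy was issued, so the caller knows a stream synchronization is still
+  // required before the input tensors are consumed.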
+ *cuda_copy |= collector->Finalize(); + + return nullptr; +} + +TRITONSERVER_Error* +ModelInstanceState::ValidateBooleanSequenceControl( + triton::common::TritonJson::Value& sequence_batching, + const std::string& control_kind, bool required, bool* have_control) +{ + std::string tensor_name; + std::string tensor_datatype; + RETURN_IF_ERROR(GetBooleanSequenceControlProperties( + sequence_batching, model_state_->Name(), control_kind, required, + &tensor_name, &tensor_datatype, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr)); + *have_control = !tensor_name.empty(); + if (*have_control) { + std::string deliminator = "__"; + int ip_index = 0; + int start_pos = tensor_name.find(deliminator); + if (start_pos == -1) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("input '" + tensor_name + + "' does not follow __ naming convention.") + .c_str()); + } + + // check if the index part of the name is not an integer + std::string index_str = tensor_name.substr(start_pos + 2); + for (auto itr = index_str.begin(); itr != index_str.end(); itr++) { + if (std::isdigit(*itr) == 0) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("input '" + tensor_name + + "' does not follow __ naming convention.") + .c_str()); + } + } + + ip_index = std::atoi(tensor_name.substr(start_pos + 2).c_str()); + input_index_map_[tensor_name] = ip_index; + } + + return nullptr; // success +} + +TRITONSERVER_Error* +ModelInstanceState::ValidateInputs(const size_t expected_input_cnt) +{ + // Collect all the expected input tensor names and validate that the model + // configuration specifies only those. + std::vector allowed_inputs; + + const torch::jit::Method& method = torch_model_->get_method("forward"); + const auto& schema = method.function().getSchema(); + const std::vector& arguments = schema.arguments(); + + // Currently, only models with a single input of type Dict(str, Tensor) are + // supported. If the model expects more than one input then they must be all + // be of type Tensor. + // + // Ignore the argument at idx 0 if it is of Class type (self param in forward + // function) + size_t start_idx = 0; + if ((arguments.size() > 0) && + (arguments.at(0).type()->kind() == c10::TypeKind::ClassType)) { + start_idx = 1; + } + if ((arguments.size() == (1 + start_idx)) && + (arguments.at(start_idx).type()->kind() == c10::TypeKind::DictType)) { + is_dict_input_ = true; + } else if (arguments.size() > start_idx) { + // Return error if multiple inputs are of kind DictType + for (size_t i = start_idx + 1; i < arguments.size(); i++) { + if (arguments.at(i).type()->kind() == c10::TypeKind::DictType) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + "Multiple inputs of kind DictType were detected. Only a single " + "input of type Dict(str, Tensor) is supported."); + } + } + + // Return error if all inputs are not of type Tensor + for (size_t i = start_idx; i < arguments.size(); i++) { + if ((arguments.at(i).type()->kind() != c10::TypeKind::TensorType) && + (arguments.at(i).type()->kind() != c10::TypeKind::ListType)) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + (std::string("An input of type '") + arguments.at(i).type()->str() + + "' was detected in the model. 
Only a single input of type " + "Dict(str, Tensor) or input(s) of type Tensor are supported.") + .c_str()); + } + allowed_inputs.emplace_back(arguments.at(i).name()); + } + + // If all inputs are tensors, match number of expected inputs between model + // and configuration + if ((arguments.size() - start_idx) != expected_input_cnt) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + (std::string("unable to load model '") + model_state_->Name() + + "', configuration expects " + std::to_string(expected_input_cnt) + + " inputs, model provides " + + std::to_string(arguments.size() - start_idx)) + .c_str()); + } + } + + triton::common::TritonJson::Value ios; + RETURN_IF_ERROR(model_state_->ModelConfig().MemberAsArray("input", &ios)); + + if (ios.ArraySize() == 0) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + "model configuration must contain at least one input, none were " + "specified."); + } + + NamingConvention naming_convention; + RETURN_IF_ERROR(GetNamingConvention(&naming_convention, allowed_inputs)); + + for (size_t i = 0; i < ios.ArraySize(); i++) { + triton::common::TritonJson::Value io; + RETURN_IF_ERROR(ios.IndexAsObject(i, &io)); + + // Validate name + std::string io_name; + RETURN_IF_ERROR(io.MemberAsString("name", &io_name)); + AddInputToMap(naming_convention, allowed_inputs, io_name, i); + // Validate data type + std::string io_dtype; + RETURN_IF_ERROR(io.MemberAsString("data_type", &io_dtype)); + const auto pr = ModelConfigDataTypeToTorchType(io_dtype); + if (!pr.first && (io_dtype != "TYPE_STRING")) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("unsupported datatype " + io_dtype + " for input '" + io_name + + "' for model '" + model_state_->Name() + "'") + .c_str()); + } + + // Validate shape for String inputs. Only allow 1 dimension. + if (io_dtype == "TYPE_STRING") { + // If a reshape is provided for the input then use that when + // validating the model shapes. + std::vector dims; + triton::common::TritonJson::Value reshape; + if (io.Find("reshape", &reshape)) { + RETURN_IF_ERROR(ParseShape(reshape, "shape", &dims)); + } else { + RETURN_IF_ERROR(ParseShape(io, "dims", &dims)); + } + + if ((dims.size() + (supports_batching_ ? 1 : 0)) > 1) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("Triton only supports 1 dimensional List of String as input for " + "'" + + std::string(io_name) + "' for model '" + model_state_->Name() + + "'") + .c_str()); + } + } + } + triton::common::TritonJson::Value sequence_batching; + if (model_state_->ModelConfig().Find( + "sequence_batching", &sequence_batching)) { + triton::common::TritonJson::Value states; + if (sequence_batching.Find("state", &states)) { + for (size_t i = 0; i < states.ArraySize(); i++) { + triton::common::TritonJson::Value state; + RETURN_IF_ERROR(states.IndexAsObject(i, &state)); + std::string state_name; + RETURN_IF_ERROR(state.MemberAsString("input_name", &state_name)); + AddInputToMap(naming_convention, allowed_inputs, state_name, i); + + // Validate data type + std::string state_dtype; + RETURN_IF_ERROR(state.MemberAsString("data_type", &state_dtype)); + const auto pr = ModelConfigDataTypeToTorchType(state_dtype); + if (!pr.first && (state_dtype != "TYPE_STRING")) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("unsupported datatype " + state_dtype + " for input state '" + + state_name + "' for model '" + model_state_->Name() + "'") + .c_str()); + } + + // Validate shape for String inputs. Only allow 1 dimension. 
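+        // Intended to mirror the one-dimensional List-of-String restriction
+        // applied to regular string inputs above.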
+ if (state_dtype == "TYPE_STRING") { + std::vector dims; + if ((dims.size() + (supports_batching_ ? 1 : 0)) > 1) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("Triton only supports 1 dimensional List of String as input " + "for " + "'" + + std::string(state_name) + "' for model '" + + model_state_->Name() + "'") + .c_str()); + } + } + } + } + } + + triton::common::TritonJson::Value batch_inputs; + RETURN_IF_ERROR( + model_state_->ModelConfig().MemberAsArray("batch_input", &batch_inputs)); + size_t i = 0; + for (const auto& batch_input : StateForModel()->BatchInputs()) { + for (const auto& input_name : batch_input.TargetNames()) { + AddInputToMap( + naming_convention, allowed_inputs, input_name, i + ios.ArraySize()); + i++; + } + } + + return nullptr; // success +} + +TRITONSERVER_Error* +ModelInstanceState::ValidateOutputs() +{ + triton::common::TritonJson::Value ios; + RETURN_IF_ERROR(model_state_->ModelConfig().MemberAsArray("output", &ios)); + std::string deliminator = "__"; + int op_index = 0; + + if (ios.ArraySize() == 0) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + "model configuration must contain at least one output, none were " + "specified."); + } + + NamingConvention naming_convention; + RETURN_IF_ERROR(GetNamingConvention(&naming_convention, {})); + + for (size_t i = 0; i < ios.ArraySize(); i++) { + triton::common::TritonJson::Value io; + RETURN_IF_ERROR(ios.IndexAsObject(i, &io)); + + // Validate name + std::string io_name; + RETURN_IF_ERROR(io.MemberAsString("name", &io_name)); + switch (naming_convention) { + case NamingConvention::NAMED_INDEX: { + int start_pos = io_name.find(deliminator); + op_index = std::atoi(io_name.substr(start_pos + 2).c_str()); + break; + } + case NamingConvention::STRICT_CONFIG_ORDERING: { + op_index = i; + break; + } + default: + break; + } + + // Validate data type + std::string io_dtype; + RETURN_IF_ERROR(io.MemberAsString("data_type", &io_dtype)); + const auto pr = ModelConfigDataTypeToTorchType(io_dtype); + if (!pr.first && (io_dtype != "TYPE_STRING")) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("unsupported datatype " + io_dtype + " for output '" + io_name + + "' for model '" + model_state_->Name() + "'") + .c_str()); + } + + // Validate shape for String outputs. Only allow 1 dimension. + if (io_dtype == "TYPE_STRING") { + // If a reshape is provided for the output then use that when + // validating the model shapes. + std::vector dims; + triton::common::TritonJson::Value reshape; + if (io.Find("reshape", &reshape)) { + RETURN_IF_ERROR(ParseShape(reshape, "shape", &dims)); + } else { + RETURN_IF_ERROR(ParseShape(io, "dims", &dims)); + } + + if ((dims.size() + (supports_batching_ ? 
1 : 0)) > 1) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("Triton only supports 1 dimensional List of String as output for " + "'" + + std::string(io_name) + "' for model '" + model_state_->Name() + + "'") + .c_str()); + } + } + + output_index_map_[io_name] = op_index; + output_dtype_map_[io_name] = ConvertTorchTypeToDataType(pr.second); + } + + triton::common::TritonJson::Value sequence_batching; + if (model_state_->ModelConfig().Find( + "sequence_batching", &sequence_batching)) { + triton::common::TritonJson::Value states; + if (sequence_batching.Find("state", &states)) { + for (size_t i = 0; i < states.ArraySize(); i++) { + triton::common::TritonJson::Value state; + RETURN_IF_ERROR(states.IndexAsObject(i, &state)); + std::string state_name; + RETURN_IF_ERROR(state.MemberAsString("output_name", &state_name)); + std::string state_dtype; + RETURN_IF_ERROR(state.MemberAsString("data_type", &state_dtype)); + std::vector dims; + RETURN_IF_ERROR(ParseShape(state, "dims", &dims)); + + // For state, naming convention is enforced to be NAMED_INDEX + int start_pos = state_name.find(deliminator); + op_index = std::atoi(state_name.substr(start_pos + 2).c_str()); + + const auto pr = ModelConfigDataTypeToTorchType(state_dtype); + if (!pr.first && (state_dtype != "TYPE_STRING")) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("unsupported datatype " + state_dtype + " for state '" + + state_name + "' for model '" + model_state_->Name() + "'") + .c_str()); + } + + // Validate shape for String outputs. Only allow 1 dimension. + if (state_dtype == "TYPE_STRING") { + if ((dims.size() + (supports_batching_ ? 1 : 0)) > 1) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("Triton only supports 1 dimensional List of String as output " + "for " + "'" + + std::string(state_name) + "' for model '" + + model_state_->Name() + "'") + .c_str()); + } + } + + output_index_map_[state_name] = op_index; + output_dtype_map_[state_name] = ConvertTorchTypeToDataType(pr.second); + } + } + } + + return nullptr; // success +} + +TRITONSERVER_Error* +ModelInstanceState::ValidateTypedSequenceControl( + triton::common::TritonJson::Value& sequence_batching, + const std::string& control_kind, bool required, bool* have_control) +{ + std::string tensor_name; + std::string tensor_datatype; + RETURN_IF_ERROR(GetTypedSequenceControlProperties( + sequence_batching, model_state_->Name(), control_kind, required, + &tensor_name, &tensor_datatype)); + *have_control = !tensor_name.empty(); + if (*have_control) { + std::string deliminator = "__"; + int ip_index = 0; + int start_pos = tensor_name.find(deliminator); + if (start_pos == -1) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("input '" + tensor_name + + "' does not follow __ naming convention.") + .c_str()); + } + + // check if the index part of the name is not an integer + std::string index_str = tensor_name.substr(start_pos + 2); + for (auto itr = index_str.begin(); itr != index_str.end(); itr++) { + if (std::isdigit(*itr) == 0) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("input '" + tensor_name + + "' does not follow __ naming convention.") + .c_str()); + } + } + + // check if the data type is supported by PyTorch + if (!ModelConfigDataTypeToTorchType(tensor_datatype).first) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("input '" + tensor_name + "' type '" + tensor_datatype + + "' is not supported by PyTorch.") + .c_str()); + } + + ip_index = 
std::atoi(tensor_name.substr(start_pos + 2).c_str()); + input_index_map_[tensor_name] = ip_index; + } + + return nullptr; // success +} + + +} // namespace triton::backend::pytorch diff --git a/src/model_instance_state.hh b/src/model_instance_state.hh new file mode 100644 index 0000000..b495510 --- /dev/null +++ b/src/model_instance_state.hh @@ -0,0 +1,178 @@ +// Copyright 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + +#include + +#include +#include +#include +#include +#include + +#include "libtorch_utils.h" +#include "model_state.hh" +#include "naming_convention.hh" +#include "triton/backend/backend_common.h" +#include "triton/backend/backend_input_collector.h" +#include "triton/backend/backend_memory.h" +#include "triton/backend/backend_model.h" +#include "triton/backend/backend_model_instance.h" +#include "triton/backend/backend_output_responder.h" +#include "triton/common/nvtx.h" +#include "triton/core/tritonbackend.h" + + +namespace triton::backend::pytorch { + +// +// ModelInstanceState +// +// State associated with a model instance. An object of this class is +// created and associated with each TRITONBACKEND_ModelInstance. +// +class ModelInstanceState : public BackendModelInstance { + private: + ModelState* model_state_; + + // The full path to the TorchScript model file. + std::string model_path_; + + std::shared_ptr torch_model_; + torch::Device device_; + + // Map from configuration name for an input to the index of + // that input in the model. + std::unordered_map input_index_map_; + uint32_t batch_input_count_ = 0; + + // Map from configuration name for an output to the index of + // that output in the model. + std::unordered_map output_index_map_; + std::unordered_map output_dtype_map_; + + // If the input to the tensor is a dictionary of tensors. + bool is_dict_input_; + + // If the model supports batching. 
+ bool supports_batching_; + + cudaEvent_t compute_input_start_event_; + cudaEvent_t compute_infer_start_event_; + cudaEvent_t compute_output_start_event_; + + // Store the cuda streams created for the 'KIND_MODEL' instance group. + std::vector stream_vec_; + + // The number of available devices. + int device_cnt_; + + public: + virtual ~ModelInstanceState(); + + // Clear CUDA cache + void ClearCache(); + + static TRITONSERVER_Error* Create( + ModelState* model_state, + TRITONBACKEND_ModelInstance* triton_model_instance, + ModelInstanceState** state); + + // Execute... + void ProcessRequests( + TRITONBACKEND_Request** requests, const uint32_t request_count); + + // Get the state of the model that corresponds to this instance. + ModelState* StateForModel() const; + + private: + ModelInstanceState( + ModelState* model_state, + TRITONBACKEND_ModelInstance* triton_model_instance); + + void AddInputToMap( + NamingConvention naming_convention, + const std::vector allowed_inputs, const std::string& io_name, + const uint32_t index); + + // Create CUDA events for statistics collection. + void CreateCudaEvents(const int32_t& device_id); + + void Execute( + std::vector* responses, + const uint32_t response_count, + std::vector* input_tensors, + std::vector* output_tensors); + + // Get the elapsed time between two CUDA events. + float GetCudaEventElapsedTime( + const cudaEvent_t& start_event, const cudaEvent_t& end_event); + + // Get the appropriate CUDA stream for input and output handling based on + // the instance group type. + cudaStream_t GetCudaStreamByInstanceKind(); + + // Get the naming convention for inputs/outputs from the model configuration + TRITONSERVER_Error* GetNamingConvention( + NamingConvention* naming_convention, + const std::vector& allowed_io); + + TRITONSERVER_Error* ReadOutputTensors( + size_t total_batch_size, + const std::vector& output_tensors, + TRITONBACKEND_Request** requests, const uint32_t request_count, + std::vector* responses); + + TRITONSERVER_Error* RecordBackendTimestamp( + uint64_t* timestamp, void* cuda_event); + + // Replace the default CUDA stream with the stream we created to ensure + // proper cuda stream synchronization. + void SetCurrentCudaStream( + const cudaStream_t& stream, const int32_t& device_id); + + TRITONSERVER_Error* SetInputTensors( + size_t total_batch_size, TRITONBACKEND_Request** requests, + const uint32_t request_count, + std::vector* responses, + BackendInputCollector* collector, std::vector* input_names, + std::vector* input_tensors, bool* cuda_copy); + + TRITONSERVER_Error* ValidateBooleanSequenceControl( + triton::common::TritonJson::Value& sequence_batching, + const std::string& control_kind, bool required, bool* have_control); + + TRITONSERVER_Error* ValidateInputs(const size_t expected_input_cnt); + + TRITONSERVER_Error* ValidateOutputs(); + + TRITONSERVER_Error* ValidateTypedSequenceControl( + triton::common::TritonJson::Value& sequence_batching, + const std::string& control_kind, bool required, bool* have_control); +}; + +} // namespace triton::backend::pytorch diff --git a/src/model_state.cc b/src/model_state.cc new file mode 100644 index 0000000..b007438 --- /dev/null +++ b/src/model_state.cc @@ -0,0 +1,495 @@ +// Copyright 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "model_state.hh" + +#include + + +namespace { +std::once_flag pytorch_interop_threads_flag; +std::once_flag pytorch_intraop_threads_flag; +} // namespace + +namespace triton::backend::pytorch { + +ModelState::ModelState(TRITONBACKEND_Model* triton_model) + : BackendModel(triton_model), enable_optimized_execution_(true), + enable_inference_mode_(true), enable_cudnn_(true), + enable_cache_cleaning_(false), enable_weight_sharing_(false), + enable_tensor_fuser_pair_({false, true}), + enable_jit_profiling_pair_({false, true}), + enable_jit_executor_pair_({false, true}) +{ +} + +TRITONSERVER_Error* +ModelState::AutoCompleteConfig() +{ + // Auto-complete configuration is not supported since PyTorch does not + // store/capture sufficient model metadata so just log error instead. + LOG_MESSAGE( + TRITONSERVER_LOG_WARN, + (std::string("skipping model configuration auto-complete for '") + + Name() + "': not supported for pytorch backend") + .c_str()); + + return nullptr; // success +} + +TRITONSERVER_Error* +ModelState::Create(TRITONBACKEND_Model* triton_model, ModelState** state) +{ + try { + *state = new ModelState(triton_model); + } + catch (const BackendModelException& ex) { + RETURN_ERROR_IF_TRUE( + ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL, + std::string("unexpected nullptr in BackendModelException")); + RETURN_IF_ERROR(ex.err_); + } + + // Auto-complete the configuration if requested... 
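+  // Note that for this backend auto-completion cannot fill in inputs and
+  // outputs (PyTorch models do not expose that metadata), so
+  // AutoCompleteConfig() above only logs a warning and the configuration is
+  // used as provided.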
+ bool auto_complete_config = false; + RETURN_IF_ERROR(TRITONBACKEND_ModelAutoCompleteConfig( + triton_model, &auto_complete_config)); + if (auto_complete_config) { + RETURN_IF_ERROR((*state)->AutoCompleteConfig()); + RETURN_IF_ERROR((*state)->SetModelConfig()); + } + + auto& model_outputs = (*state)->model_outputs_; + // Parse the output states in the model configuration + triton::common::TritonJson::Value sequence_batching; + if ((*state)->ModelConfig().Find("sequence_batching", &sequence_batching)) { + triton::common::TritonJson::Value states; + if (sequence_batching.Find("state", &states)) { + for (size_t i = 0; i < states.ArraySize(); i++) { + triton::common::TritonJson::Value state; + RETURN_IF_ERROR(states.IndexAsObject(i, &state)); + std::string output_state_name; + RETURN_IF_ERROR( + state.MemberAsString("output_name", &output_state_name)); + auto it = model_outputs.find(output_state_name); + if (it == model_outputs.end()) { + model_outputs.insert({output_state_name, std::make_pair(-1, i)}); + } else { + it->second.second = i; + } + } + } + } + + // Parse the output names in the model configuration + triton::common::TritonJson::Value outputs; + RETURN_IF_ERROR((*state)->ModelConfig().MemberAsArray("output", &outputs)); + for (size_t i = 0; i < outputs.ArraySize(); i++) { + triton::common::TritonJson::Value output; + THROW_IF_BACKEND_INSTANCE_ERROR(outputs.IndexAsObject(i, &output)); + + // Use names from ModelConfig by reference since the model + // config will persist longer than this inference execution. + std::string output_name; + THROW_IF_BACKEND_INSTANCE_ERROR( + output.MemberAsString("name", &output_name)); + + auto it = model_outputs.find(output_name); + if (it == model_outputs.end()) { + model_outputs.insert({output_name, std::make_pair(i, -1)}); + } else { + it->second.first = i; + } + } + + RETURN_IF_ERROR((*state)->ParseParameters()); + + return nullptr; // success +} + +bool +ModelState::EnabledCacheCleaning() +{ + return enable_cache_cleaning_; +} + +bool +ModelState::EnabledCudnn() +{ + return enable_cudnn_; +} + +bool +ModelState::EnabledInferenceMode() +{ + return enable_inference_mode_; +} + +const std::pair& +ModelState::EnabledJitExecutor() const +{ + return enable_jit_executor_pair_; +} + +const std::pair& +ModelState::EnabledJitProfiling() const +{ + return enable_jit_profiling_pair_; +} + +bool +ModelState::EnabledOptimizedExecution() +{ + return enable_optimized_execution_; +} + +const std::pair& +ModelState::EnabledTensorExprFuser() const +{ + return enable_tensor_fuser_pair_; +} + +bool +ModelState::EnabledWeightSharing() +{ + return enable_weight_sharing_; +} + +TRITONSERVER_Error* +ModelState::LoadModel( + const std::string& artifact_name, const torch::Device device, + std::string* model_path, const TRITONSERVER_InstanceGroupKind& kind, + std::shared_ptr* torch_model) +{ + // Find the TorchScript file that describes the model. If the model + // configuration doesn't have an explicit model file specified then + // use the default name ("model.pt"). 
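+  // For example, version 1 of a model using the default artifact name
+  // resolves to <model_repository>/<model_name>/1/model.pt.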
+ std::string cc_model_filename = artifact_name; + if (cc_model_filename.empty()) { + cc_model_filename = "model.pt"; + } + + *model_path = JoinPath( + {RepositoryPath(), std::to_string(Version()), cc_model_filename}); + + { + bool exists; + RETURN_IF_ERROR(FileExists(*model_path, &exists)); + RETURN_ERROR_IF_FALSE( + exists, TRITONSERVER_ERROR_UNAVAILABLE, + std::string("unable to find '") + *model_path + + "' for model instance '" + Name() + "'"); + } + + // If weight sharing is enabled, skip loading model if + // it is already available on the target device + std::pair device_pair; + if (enable_weight_sharing_) { + device_pair = std::make_pair(!device.is_cpu(), device.index()); + auto mit = torch_models_.find(device_pair); + if (mit != torch_models_.end()) { + *torch_model = mit->second; + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Reusing TorchScript model for instance '") + Name() + + "'") + .c_str()); + return nullptr; // success + } + } + + // Serialize the torch model to string + std::string model_data_str; + RETURN_IF_ERROR(ReadTextFile(*model_path, &model_data_str)); + + // InferenceMode should be used to guard all tensors operations including + // model loading: https://pytorch.org/cppdocs/notes/inference_mode.html + torch::InferenceMode infer_guard(EnabledInferenceMode()); + + try { + std::istringstream model_stream(model_data_str); + if (kind == TRITONSERVER_INSTANCEGROUPKIND_MODEL) { + // Load the model without selecting a device. + torch_model->reset( + new torch::jit::Module(torch::jit::load(model_stream))); + } else { + torch_model->reset( + new torch::jit::Module(torch::jit::load(model_stream, device))); + } + } + catch (const std::exception& ex) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("failed to load model '" + Name() + "': " + ex.what()).c_str()); + } + + if (enable_weight_sharing_) { + if (!((torch_models_.emplace(device_pair, *torch_model)).second)) { + std::string type = device.is_cpu() ? "CPU" : "GPU"; + LOG_MESSAGE( + TRITONSERVER_LOG_WARN, + (std::string("Model already found on target ") + type + " device " + + "(id " + std::to_string(device.index()) + ") for '" + Name() + "'") + .c_str()); + } + } + + return nullptr; // success +} + +const std::map>& +ModelState::ModelOutputs() +{ + return model_outputs_; +} + +TRITONSERVER_Error* +ModelState::ParseParameters() +{ + triton::common::TritonJson::Value params; + bool status = model_config_.Find("parameters", ¶ms); + if (status) { + // If 'DISABLE_OPTIMIZED_EXECUTION' is not present in 'parameters' then no + // update is made to 'enable_optimized_execution_'. + bool disable_optimized_execution = false; + TRITONSERVER_Error* err = ParseParameter( + params, "DISABLE_OPTIMIZED_EXECUTION", &disable_optimized_execution); + if (err != nullptr) { + if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { + return err; + } else { + TRITONSERVER_ErrorDelete(err); + } + } + enable_optimized_execution_ = !disable_optimized_execution; + + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Optimized execution is ") + + (enable_optimized_execution_ ? "enabled" : "disabled") + + " for model instance '" + Name() + "'") + .c_str()); + + // If 'ENABLE_CACHE_CLEANING' is not present in 'parameters' then + // no update is made to 'enable_cache_cleaning_'. 
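+    // Parameters are read from the model's config.pbtxt, e.g.:
+    //   parameters: {
+    //     key: "ENABLE_CACHE_CLEANING"
+    //     value: { string_value: "true" }
+    //   }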
+ err = ParseParameter( + params, "ENABLE_CACHE_CLEANING", &enable_cache_cleaning_); + if (err != nullptr) { + if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { + return err; + } else { + TRITONSERVER_ErrorDelete(err); + } + } + + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Cache Cleaning is ") + + (enable_cache_cleaning_ ? "enabled" : "disabled") + + " for model instance '" + Name() + "'") + .c_str()); + + // If 'INFERENCE_MODE' is not present in 'parameters' then no update is made + // to 'enable_inference_mode_'. + err = ParseParameter(params, "INFERENCE_MODE", &enable_inference_mode_); + if (err != nullptr) { + if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { + return err; + } else { + TRITONSERVER_ErrorDelete(err); + } + } + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Inference Mode is ") + + (enable_inference_mode_ ? "enabled" : "disabled") + + " for model instance '" + Name() + "'") + .c_str()); + + // If 'DISABLE_CUDNN' is not present in 'parameters' then no update is made + // to 'enable_cudnn_'. + bool disable_cudnn = false; + err = ParseParameter(params, "DISABLE_CUDNN", &disable_cudnn); + if (err != nullptr) { + if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { + return err; + } else { + TRITONSERVER_ErrorDelete(err); + } + } + enable_cudnn_ = !disable_cudnn; + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("cuDNN is ") + (enable_cudnn_ ? "enabled" : "disabled") + + " for model instance '" + Name() + "'") + .c_str()); + + // If 'ENABLE_TENSOR_FUSER' is not present in 'parameters' then no + // update is made to 'enable_tensor_fuser'. + bool enable_tensor_fuser = false; + err = ParseParameter(params, "ENABLE_TENSOR_FUSER", &enable_tensor_fuser); + if (err != nullptr) { + if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { + return err; + } else { + TRITONSERVER_ErrorDelete(err); + } + } else { + enable_tensor_fuser_pair_ = {true, enable_tensor_fuser}; + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Tensor fuser is ") + + (enable_tensor_fuser ? "enabled" : "disabled") + + " for model instance '" + Name() + "'") + .c_str()); + } + + // If 'ENABLE_WEIGHT_SHARING' is not present in 'parameters' then no + // update is made to 'enable_weight_sharing'. + err = ParseParameter( + params, "ENABLE_WEIGHT_SHARING", &enable_weight_sharing_); + if (err != nullptr) { + if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { + return err; + } else { + TRITONSERVER_ErrorDelete(err); + } + } else { + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Weight sharing is ") + + (enable_weight_sharing_ ? "enabled" : "disabled") + + " for model instance '" + Name() + "'") + .c_str()); + } + + // If 'ENABLE_JIT_PROFILING' is not present in 'parameters' then no update + // is made to 'enable_jit_profiling'. + bool enable_jit_profiling = false; + err = ParseParameter(params, "ENABLE_JIT_PROFILING", &enable_jit_profiling); + if (err != nullptr) { + if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { + return err; + } else { + TRITONSERVER_ErrorDelete(err); + } + } else { + enable_jit_profiling_pair_ = {true, enable_jit_profiling}; + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Jit profiling is ") + + (enable_jit_profiling ? "enabled" : "disabled") + + " for model instance '" + Name() + "'") + .c_str()); + } + + // If 'ENABLE_JIT_EXECUTOR' is not present in 'parameters' then no update is + // made to 'enable_jit_executor'. 
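+    // As with the settings above, a TRITONSERVER_ERROR_NOT_FOUND returned by
+    // ParseParameter() is swallowed so that an absent parameter keeps its
+    // default; the (set, enabled) pair is only updated when the parameter is
+    // present in the configuration.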
+ bool enable_jit_executor = false; + err = ParseParameter(params, "ENABLE_JIT_EXECUTOR", &enable_jit_executor); + if (err != nullptr) { + if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { + return err; + } else { + TRITONSERVER_ErrorDelete(err); + } + } else { + enable_jit_executor_pair_ = {true, enable_jit_executor}; + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Jit executor is ") + + (enable_jit_executor ? "enabled" : "disabled") + + " for model instance '" + Name() + "'") + .c_str()); + } + + // If 'INTRA_OP_THREAD_COUNT' is not present in 'parameters' then no update + // is made to 'intra_op_thread_count', which by default will take all + // threads + int intra_op_thread_count = -1; + err = + ParseParameter(params, "INTRA_OP_THREAD_COUNT", &intra_op_thread_count); + if (err != nullptr) { + if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { + return err; + } else { + TRITONSERVER_ErrorDelete(err); + } + } else { + if (intra_op_thread_count > 0) { + // at::set_num_threads() does not throw if called more than once, but + // issues warnings. std::call_once() is useful to limit these. + std::call_once(pytorch_intraop_threads_flag, [intra_op_thread_count]() { + at::set_num_threads(intra_op_thread_count); + }); + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Intra op thread count is set to ") + + std::to_string(at::get_num_threads()) + " for model instance '" + + Name() + "'") + .c_str()); + } + } + + // If 'INTER_OP_THREAD_COUNT' is not present in 'parameters' then no update + // is made to 'inter_op_thread_count', which by default will take all + // threads + int inter_op_thread_count = -1; + err = + ParseParameter(params, "INTER_OP_THREAD_COUNT", &inter_op_thread_count); + if (err != nullptr) { + if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) { + return err; + } else { + TRITONSERVER_ErrorDelete(err); + } + } else { + if (inter_op_thread_count > 0) { + // at::set_num_interop_threads() throws if called more than once. + // std::call_once() should prevent this, but try/catch is additionally + // used for safety. + std::call_once(pytorch_interop_threads_flag, [inter_op_thread_count]() { + try { + at::set_num_interop_threads(inter_op_thread_count); + } + catch (const c10::Error& e) { + // do nothing + } + }); + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Inter op thread count is set to ") + + std::to_string(at::get_num_interop_threads()) + + " for model instance '" + Name() + "'") + .c_str()); + } + } + } + + return nullptr; +} + +} // namespace triton::backend::pytorch diff --git a/src/model_state.hh b/src/model_state.hh new file mode 100644 index 0000000..1a404b8 --- /dev/null +++ b/src/model_state.hh @@ -0,0 +1,131 @@ +// Copyright 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + +#include + +#include +#include +#include + +#include "libtorch_utils.h" +#include "naming_convention.hh" +#include "triton/backend/backend_common.h" +#include "triton/backend/backend_input_collector.h" +#include "triton/backend/backend_memory.h" +#include "triton/backend/backend_model.h" +#include "triton/backend/backend_model_instance.h" +#include "triton/backend/backend_output_responder.h" +#include "triton/common/nvtx.h" +#include "triton/core/tritonbackend.h" + +// for thread control +// https://pytorch.org/docs/stable/notes/cpu_threading_torchscript_inference.html#runtime-api +// https://github.com/pytorch/pytorch/blob/v2.2.1-rc3/aten/src/ATen/Parallel.h#L133 +#include + + +namespace triton::backend::pytorch { + +class ModelState : public triton::backend::BackendModel { + private: + // Flag to indicate whether optimized execution is enabled. Defaults to true. + bool enable_optimized_execution_; + + // Flag to indicate whether inference mode is enabled. Defaults to false. + bool enable_inference_mode_; + + // Flag to indicate whether cudnn is enabled. Defaults to true. + bool enable_cudnn_; + + // Flag to indicate whether cache cleaning after each run is enabled. + // Defaults to false. + bool enable_cache_cleaning_; + + // Flag to indicate whether weight sharing is enabled. Defaults to false. + bool enable_weight_sharing_; + + // Flag pairs to indicate if various JIT settings are set and + // enabled respectively. Defaults to (false, true). Default behavior + // is to do nothing if not explicitly set. + std::pair enable_tensor_fuser_pair_; + std::pair enable_jit_profiling_pair_; + std::pair enable_jit_executor_pair_; + + // Model mapping for shared TorchScript model across all instances on the + // same device. The key is a pair of isGPU and device index. + std::map< + std::pair, std::shared_ptr> + torch_models_; + + // model_outputs is a map that contains unique outputs that the model must + // provide. The first pair is the model output index and the second is + // the index in the model state, -1 is used if one is not required. + // In the model configuration, the output in the state configuration + // can have intersection with the outputs section of the model. If an output + // is specified both in the output section and state section, it indicates + // that the backend must return the output state to the client too. 
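+  // For example, an output listed at index 2 of the 'output' section that
+  // also backs the state at index 0 maps to {2, 0}; an output used only as a
+  // state maps to {-1, 0}.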
+ std::map> model_outputs_; + + public: + virtual ~ModelState() = default; + + static TRITONSERVER_Error* Create( + TRITONBACKEND_Model* triton_model, ModelState** state); + + bool EnabledCacheCleaning(); + + bool EnabledCudnn(); + + bool EnabledInferenceMode(); + + const std::pair& EnabledJitExecutor() const; + + const std::pair& EnabledJitProfiling() const; + + bool EnabledOptimizedExecution(); + + const std::pair& EnabledTensorExprFuser() const; + + bool EnabledWeightSharing(); + + TRITONSERVER_Error* LoadModel( + const std::string& artifact_name, const torch::Device device, + std::string* model_path, const TRITONSERVER_InstanceGroupKind& kind, + std::shared_ptr* torch_model); + + const std::map>& ModelOutputs(); + + private: + ModelState(TRITONBACKEND_Model* triton_model); + + TRITONSERVER_Error* AutoCompleteConfig(); + + TRITONSERVER_Error* ParseParameters(); +}; + +} // namespace triton::backend::pytorch diff --git a/src/naming_convention.hh b/src/naming_convention.hh new file mode 100644 index 0000000..756cba4 --- /dev/null +++ b/src/naming_convention.hh @@ -0,0 +1,40 @@ +// Copyright 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + + +namespace triton::backend::pytorch { + +// The naming convention followed for inputs/outputs in the model configuration. +// Outputs don't support FORWARD_ARGUMENT. +enum class NamingConvention { + NAMED_INDEX, + FORWARD_ARGUMENT, + STRICT_CONFIG_ORDERING +}; + +} // namespace triton::backend::pytorch diff --git a/src/string_utilities.cc b/src/string_utilities.cc new file mode 100644 index 0000000..7e7555f --- /dev/null +++ b/src/string_utilities.cc @@ -0,0 +1,254 @@ +// Copyright 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "string_utilities.hh" + + +namespace triton::backend::pytorch { + +// This function will return a tensor's contents as a contiguous +// chunk in system memory. In some cases this will require copying the data. +// If that happens, 'contiguous_buffer' will be set to hold the contiguous +// chunk and 'cuda_copy' will be set to indicate whether CUDA copy is +// conducted. The data copy can be avoided if the input is already in +// a contiguous chunk and the input is located in memory type and id +// specified. +TRITONSERVER_Error* +GetContiguousInputContent( + TRITONBACKEND_Input* rinput, const uint32_t buffer_count, + const char** content, size_t* content_byte_size, + std::vector* contiguous_buffer, cudaStream_t stream, bool* cuda_copy) +{ + *cuda_copy = false; + + // Check input buffers to see if data copy is necessary + size_t chunk_count = 0; + bool type_mismatch = false; + uint64_t total_byte_size = 0; + for (size_t idx = 0; idx < buffer_count; ++idx) { + TRITONSERVER_MemoryType src_memory_type; + int64_t src_memory_type_id; + size_t src_byte_size; + const void* src_ptr; + + RETURN_IF_ERROR(TRITONBACKEND_InputBuffer( + rinput, idx, &src_ptr, &src_byte_size, &src_memory_type, + &src_memory_type_id)); + + if (src_ptr != nullptr) { + chunk_count++; + total_byte_size += src_byte_size; + type_mismatch |= (src_memory_type == TRITONSERVER_MEMORY_GPU); + } + } + + if (chunk_count == 0) { + *content = nullptr; + *content_byte_size = 0; + } else if ((chunk_count == 1) && !type_mismatch) { + TRITONSERVER_MemoryType src_memory_type; + int64_t src_memory_type_id; + RETURN_IF_ERROR(TRITONBACKEND_InputBuffer( + rinput, 0, (const void**)content, content_byte_size, &src_memory_type, + &src_memory_type_id)); + } else { + contiguous_buffer->resize(total_byte_size); + + size_t offset = 0; + for (size_t i = 0; i < chunk_count; i++) { + bool cuda_used; + TRITONSERVER_MemoryType src_memory_type; + int64_t src_memory_type_id; + size_t src_byte_size; + const void* src_ptr; + + RETURN_IF_ERROR(TRITONBACKEND_InputBuffer( + rinput, i, &src_ptr, &src_byte_size, &src_memory_type, + &src_memory_type_id)); + RETURN_IF_ERROR(CopyBuffer( + "Contiguous input", src_memory_type, src_memory_type_id, + TRITONSERVER_MEMORY_CPU, 0, src_byte_size, src_ptr, + contiguous_buffer->data() + offset, stream, &cuda_used)); + *cuda_copy |= 
+      offset += src_byte_size;
+    }
+
+    *content = contiguous_buffer->data();
+    *content_byte_size = total_byte_size;
+  }
+
+  return nullptr;  // success
+}
+
+void
+FillStringTensor(torch::List<std::string>* input_list, const size_t cnt)
+{
+  for (size_t c = 0; c < cnt; ++c) {
+    input_list->push_back("");
+  }
+}
+
+bool
+SetStringBuffer(
+    torch::List<torch::jit::IValue>* tensor, TRITONBACKEND_Response** response,
+    TRITONBACKEND_Output* response_output, TRITONBACKEND_State* response_state,
+    const size_t tensor_element_count, cudaStream_t stream,
+    std::string* serialized, bool state)
+{
+  bool cuda_copy = false;
+
+  // Serialize the output tensor strings. Each string is serialized as
+  // a 4-byte length followed by the string itself with no
+  // null-terminator.
+  serialized->clear();
+  for (size_t e = 0; e < tensor_element_count; ++e) {
+    std::string str = tensor->get(e).to<std::string>();
+    const char* cstr = str.c_str();
+    size_t len = str.length();
+    serialized->append(reinterpret_cast<const char*>(&len), sizeof(uint32_t));
+    if (len > 0) {
+      serialized->append(cstr, len);
+    }
+  }
+
+  // Allocate a buffer large enough to hold the serialized tensor.
+  TRITONSERVER_MemoryType actual_memory_type = TRITONSERVER_MEMORY_CPU;
+  int64_t actual_memory_type_id = 0;
+
+  TRITONSERVER_Error* err;
+  void* buffer;
+
+  if (!state) {
+    auto err = TRITONBACKEND_OutputBuffer(
+        response_output, &buffer, serialized->size(), &actual_memory_type,
+        &actual_memory_type_id);
+    if (err != nullptr) {
+      RESPOND_AND_SET_NULL_IF_ERROR(response, err);
+      return cuda_copy;
+    }
+  } else {
+    auto err = TRITONBACKEND_StateBuffer(
+        response_state, &buffer, serialized->size(), &actual_memory_type,
+        &actual_memory_type_id);
+    if (err != nullptr) {
+      RESPOND_AND_SET_NULL_IF_ERROR(response, err);
+      return cuda_copy;
+    }
+  }
+  // Copy the serialized tensor into the allocated buffer.
+  bool cuda_used = false;
+  err = CopyBuffer(
+      "String output", TRITONSERVER_MEMORY_CPU /* src_memory_type */,
+      0 /* src_memory_type_id */, actual_memory_type, actual_memory_type_id,
+      serialized->size(), reinterpret_cast<const void*>(serialized->c_str()),
+      buffer, stream, &cuda_used);
+  cuda_copy |= cuda_used;
+
+  if (err != nullptr) {
+    RESPOND_AND_SET_NULL_IF_ERROR(response, err);
+    return cuda_copy;
+  }
+
+  if (state) {
+    RESPOND_AND_SET_NULL_IF_ERROR(
+        response, TRITONBACKEND_StateUpdate(response_state));
+  }
+
+  return cuda_copy;
+}
+
+bool
+SetStringInputTensor(
+    torch::List<std::string>* input_list, TRITONBACKEND_Input* input,
+    const char* name, const uint32_t buffer_count,
+    const size_t request_element_cnt, TRITONBACKEND_Response** response,
+    cudaStream_t stream, const char* host_policy_name)
+{
+  bool cuda_copy = false;
+
+  // For the string data type, we always need the data on the CPU so
+  // that we can read each string's length and construct the string
+  // properly. So if the request's input tensor is not on the CPU, we
+  // need to copy it there.
+  const char* content = nullptr;
+  size_t content_byte_size = 0;
+
+  std::vector<char> contiguous_buffer;
+  auto err = GetContiguousInputContent(
+      input, buffer_count, &content, &content_byte_size, &contiguous_buffer,
+      stream, &cuda_copy);
+  if (err != nullptr) {
+    RESPOND_AND_SET_NULL_IF_ERROR(response, err);
+    FillStringTensor(input_list, request_element_cnt);
+    return cuda_copy;
+  }
+
+#ifdef TRITON_ENABLE_GPU
+  if (cuda_copy) {
+    cudaStreamSynchronize(stream);
+    cuda_copy = false;
+  }
+#endif  // TRITON_ENABLE_GPU
+
+  std::vector<std::pair<const char*, const uint32_t>> str_list;
+  err = ValidateStringBuffer(
+      content, content_byte_size, request_element_cnt, name, &str_list);
+  // Set string values.
+  for (const auto& [addr, len] : str_list) {
+    input_list->push_back(std::string(addr, len));
+  }
+
+  size_t element_cnt = str_list.size();
+  if (err != nullptr) {
+    RESPOND_AND_SET_NULL_IF_ERROR(response, err);
+    FillStringTensor(input_list, request_element_cnt - element_cnt);
+  }
+  return cuda_copy;
+}
+
+bool
+SetStringOutputBuffer(
+    torch::List<torch::jit::IValue>* tensor, TRITONBACKEND_Response** response,
+    TRITONBACKEND_Output* response_output, const size_t tensor_element_count,
+    cudaStream_t stream, std::string* serialized)
+{
+  return SetStringBuffer(
+      tensor, response, response_output, nullptr /* response_state */,
+      tensor_element_count, stream, serialized, false /* state */);
+}
+
+bool
+SetStringStateBuffer(
+    torch::List<torch::jit::IValue>* tensor, TRITONBACKEND_Response** response,
+    TRITONBACKEND_State* response_state, const size_t tensor_element_count,
+    cudaStream_t stream, std::string* serialized)
+{
+  return SetStringBuffer(
+      tensor, response, nullptr /* response_output */, response_state,
+      tensor_element_count, stream, serialized, true /* state */);
+}
+
+} // namespace triton::backend::pytorch
diff --git a/src/string_utilities.hh b/src/string_utilities.hh
new file mode 100644
index 0000000..8373478
--- /dev/null
+++ b/src/string_utilities.hh
@@ -0,0 +1,106 @@
+// Copyright 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+// * Neither the name of NVIDIA CORPORATION nor the names of its
+// contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#pragma once
+
+#include <stdint.h>
+
+#include <exception>
+#include <memory>
+#include <thread>
+
+#include "libtorch_utils.h"
+#include "triton/backend/backend_common.h"
+#include "triton/backend/backend_input_collector.h"
+#include "triton/backend/backend_memory.h"
+#include "triton/backend/backend_model.h"
+#include "triton/backend/backend_model_instance.h"
+#include "triton/backend/backend_output_responder.h"
+#include "triton/common/nvtx.h"
+#include "triton/core/tritonbackend.h"
+
+#ifdef TRITON_PYTORCH_ENABLE_TORCHVISION
+// Suppress warnings in torch headers
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wsign-compare"
+#pragma warning(push, 0)
+#include <torch/script.h>
+#include <torchvision/vision.h>  // Torchvision header
+#pragma warning(pop)
+#pragma GCC diagnostic pop
+#endif  // TRITON_PYTORCH_ENABLE_TORCHVISION
+
+#ifdef TRITON_ENABLE_GPU
+#include <c10/cuda/CUDACachingAllocator.h>
+#include <c10/cuda/CUDAStream.h>
+#include <cuda_runtime_api.h>
+#endif  // TRITON_ENABLE_GPU
+
+// for thread control
+// https://pytorch.org/docs/stable/notes/cpu_threading_torchscript_inference.html#runtime-api
+// https://github.com/pytorch/pytorch/blob/v2.2.1-rc3/aten/src/ATen/Parallel.h#L133
+#include <ATen/Parallel.h>
+
+
+namespace triton::backend::pytorch {
+
+void FillStringTensor(torch::List<std::string>* input_list, const size_t cnt);
+
+// This function will return a tensor's contents as a contiguous
+// chunk in system memory. In some cases this will require copying the data.
+// If that happens, 'contiguous_buffer' will be set to hold the contiguous
+// chunk and 'cuda_copy' will be set to indicate whether a CUDA copy was
+// conducted. The data copy can be avoided if the input is already a
+// contiguous chunk and is located in the memory type and id specified.
+TRITONSERVER_Error* GetContiguousInputContent(
+    TRITONBACKEND_Input* rinput, const uint32_t buffer_count,
+    const char** content, size_t* content_byte_size,
+    std::vector<char>* contiguous_buffer, cudaStream_t stream, bool* cuda_copy);
+
+bool SetStringBuffer(
+    torch::List<torch::jit::IValue>* tensor, TRITONBACKEND_Response** response,
+    TRITONBACKEND_Output* response_output, TRITONBACKEND_State* response_state,
+    const size_t tensor_element_count, cudaStream_t stream,
+    std::string* serialized, bool state);
+
+bool SetStringInputTensor(
+    torch::List<std::string>* input_list, TRITONBACKEND_Input* input,
+    const char* name, const uint32_t buffer_count,
+    const size_t request_element_cnt, TRITONBACKEND_Response** response,
+    cudaStream_t stream, const char* host_policy_name);
+
+bool SetStringOutputBuffer(
+    torch::List<torch::jit::IValue>* tensor, TRITONBACKEND_Response** response,
+    TRITONBACKEND_Output* response_output, const size_t tensor_element_count,
+    cudaStream_t stream, std::string* serialized);
+
+bool SetStringStateBuffer(
+    torch::List<torch::jit::IValue>* tensor, TRITONBACKEND_Response** response,
+    TRITONBACKEND_State* response_state, const size_t tensor_element_count,
+    cudaStream_t stream, std::string* serialized);
+
+} // namespace triton::backend::pytorch

From 1d5092ea52a1164f6901cdbcb1056a8f47e70453 Mon Sep 17 00:00:00 2001
From: J Wyman
Date: Mon, 3 Nov 2025 15:04:07 -0500
Subject: [PATCH 2/2] Accept Rename

Renamed 'string_utilities.*' to 'string_utils.*' as requested.
--- CMakeLists.txt | 2 +- src/{string_utilities.cc => string_utils.cc} | 2 +- src/{string_utilities.hh => string_utils.hh} | 0 3 files changed, 2 insertions(+), 2 deletions(-) rename src/{string_utilities.cc => string_utils.cc} (99%) rename src/{string_utilities.hh => string_utils.hh} (100%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 227aa27..d3d9dd0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -282,7 +282,7 @@ add_library( src/libtorch_utils.h src/model_instance_state.cc src/model_state.cc - src/string_utilities.cc + src/string_utils.cc ) add_library( diff --git a/src/string_utilities.cc b/src/string_utils.cc similarity index 99% rename from src/string_utilities.cc rename to src/string_utils.cc index 7e7555f..a605c7c 100644 --- a/src/string_utilities.cc +++ b/src/string_utils.cc @@ -24,7 +24,7 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -#include "string_utilities.hh" +#include "string_utils.hh" namespace triton::backend::pytorch { diff --git a/src/string_utilities.hh b/src/string_utils.hh similarity index 100% rename from src/string_utilities.hh rename to src/string_utils.hh
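
Note (not part of the patches above): the length-prefixed layout that SetStringBuffer
writes and ValidateStringBuffer consumes -- a 4-byte length followed by the raw bytes,
with no null terminator, one record per tensor element -- can be exercised in isolation.
The sketch below is an illustrative round trip under that assumption on a little-endian
host; the helper names (SerializeStrings, DeserializeStrings) are invented for the
example and do not exist in the backend.

// string_layout_example.cc -- illustrative only, not part of the patch.
#include <cstdint>
#include <cstring>
#include <iostream>
#include <string>
#include <vector>

// Serialize like SetStringBuffer: 4-byte length, then the bytes, no null
// terminator, one record per element.
static std::string
SerializeStrings(const std::vector<std::string>& elems)
{
  std::string serialized;
  for (const std::string& s : elems) {
    const uint32_t len = static_cast<uint32_t>(s.size());
    serialized.append(reinterpret_cast<const char*>(&len), sizeof(len));
    serialized.append(s);
  }
  return serialized;
}

// Walk the buffer the way the backend's deserialization path does: read a
// 4-byte length, then that many payload bytes, until the buffer is exhausted.
static std::vector<std::string>
DeserializeStrings(const std::string& buf)
{
  std::vector<std::string> out;
  size_t offset = 0;
  while (offset + sizeof(uint32_t) <= buf.size()) {
    uint32_t len = 0;
    std::memcpy(&len, buf.data() + offset, sizeof(len));
    offset += sizeof(len);
    if (offset + len > buf.size()) {
      break;  // truncated record
    }
    out.emplace_back(buf.data() + offset, len);
    offset += len;
  }
  return out;
}

int
main()
{
  const std::string wire = SerializeStrings({"hello", "", "triton"});
  for (const std::string& s : DeserializeStrings(wire)) {
    std::cout << "[" << s << "]" << std::endl;  // prints [hello], [], [triton]
  }
  return 0;
}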