Commit 2e66c48

maintenance: Separate Code into Separate Files
This change splits the monolithic src/libtorch.cc into multiple files, giving each class its own header and source file.
1 parent 6d02d35 commit 2e66c48
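
The reorganization follows the usual C++ convention of declaring a class in a header and defining it in a matching source file. As a rough, hypothetical sketch of that pattern (the names and members below are illustrative only; the commit's actual model_state.cc and model_instance_state.cc are far larger and are not shown in this diff):

// model_state.hh -- hypothetical sketch of the header/source split, not the commit's actual code
#pragma once

#include <string>
#include <utility>

namespace triton::backend::pytorch {

// The declaration lives in the header so other translation units can use the type.
class ModelState {
 public:
  explicit ModelState(std::string name);
  const std::string& Name() const;

 private:
  std::string name_;  // placeholder member for illustration
};

}  // namespace triton::backend::pytorch

// model_state.cc -- hypothetical sketch; the definitions live in the source file
#include "model_state.hh"

namespace triton::backend::pytorch {

ModelState::ModelState(std::string name) : name_(std::move(name)) {}

const std::string&
ModelState::Name() const
{
  return name_;
}

}  // namespace triton::backend::pytorch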

File tree

10 files changed: +2902 −2489 lines

CMakeLists.txt

Lines changed: 3 additions & 0 deletions
@@ -280,6 +280,9 @@ add_library(
   src/libtorch.cc
   src/libtorch_utils.cc
   src/libtorch_utils.h
+  src/model_instance_state.cc
+  src/model_state.cc
+  src/_static_.cc
 )
 
 add_library(

src/_static_.cc

Lines changed: 254 additions & 0 deletions
@@ -0,0 +1,254 @@
// Copyright 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//  * Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
//  * Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//  * Neither the name of NVIDIA CORPORATION nor the names of its
//    contributors may be used to endorse or promote products derived
//    from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "_static_.hh"


namespace triton::backend::pytorch {

// This function will return a tensor's contents as a contiguous
// chunk in system memory. In some cases this will require copying the data.
// If that happens, 'contiguous_buffer' will be set to hold the contiguous
// chunk and 'cuda_copy' will be set to indicate whether CUDA copy is
// conducted. The data copy can be avoided if the input is already in
// a contiguous chunk and the input is located in memory type and id
// specified.
TRITONSERVER_Error*
GetContiguousInputContent(
    TRITONBACKEND_Input* rinput, const uint32_t buffer_count,
    const char** content, size_t* content_byte_size,
    std::vector<char>* contiguous_buffer, cudaStream_t stream, bool* cuda_copy)
{
  *cuda_copy = false;

  // Check input buffers to see if data copy is necessary
  size_t chunk_count = 0;
  bool type_mismatch = false;
  uint64_t total_byte_size = 0;
  for (size_t idx = 0; idx < buffer_count; ++idx) {
    TRITONSERVER_MemoryType src_memory_type;
    int64_t src_memory_type_id;
    size_t src_byte_size;
    const void* src_ptr;

    RETURN_IF_ERROR(TRITONBACKEND_InputBuffer(
        rinput, idx, &src_ptr, &src_byte_size, &src_memory_type,
        &src_memory_type_id));

    if (src_ptr != nullptr) {
      chunk_count++;
      total_byte_size += src_byte_size;
      type_mismatch |= (src_memory_type == TRITONSERVER_MEMORY_GPU);
    }
  }

  if (chunk_count == 0) {
    *content = nullptr;
    *content_byte_size = 0;
  } else if ((chunk_count == 1) && !type_mismatch) {
    TRITONSERVER_MemoryType src_memory_type;
    int64_t src_memory_type_id;
    RETURN_IF_ERROR(TRITONBACKEND_InputBuffer(
        rinput, 0, (const void**)content, content_byte_size, &src_memory_type,
        &src_memory_type_id));
  } else {
    contiguous_buffer->resize(total_byte_size);

    size_t offset = 0;
    for (size_t i = 0; i < chunk_count; i++) {
      bool cuda_used;
      TRITONSERVER_MemoryType src_memory_type;
      int64_t src_memory_type_id;
      size_t src_byte_size;
      const void* src_ptr;

      RETURN_IF_ERROR(TRITONBACKEND_InputBuffer(
          rinput, i, &src_ptr, &src_byte_size, &src_memory_type,
          &src_memory_type_id));
      RETURN_IF_ERROR(CopyBuffer(
          "Contiguous input", src_memory_type, src_memory_type_id,
          TRITONSERVER_MEMORY_CPU, 0, src_byte_size, src_ptr,
          contiguous_buffer->data() + offset, stream, &cuda_used));
      *cuda_copy |= cuda_used;
      offset += src_byte_size;
    }

    *content = contiguous_buffer->data();
    *content_byte_size = total_byte_size;
  }

  return nullptr;  // success
}

void
FillStringTensor(torch::List<std::string>* input_list, const size_t cnt)
{
  for (size_t c = 0; c < cnt; ++c) {
    input_list->push_back("");
  }
}

bool
SetStringBuffer(
    torch::List<torch::jit::IValue>* tensor, TRITONBACKEND_Response** response,
    TRITONBACKEND_Output* response_output, TRITONBACKEND_State* response_state,
    const size_t tensor_element_count, cudaStream_t stream,
    std::string* serialized, bool state)
{
  bool cuda_copy = false;

  // Serialize the output tensor strings. Each string is serialized as
  // a 4-byte length followed by the string itself with no
  // null-terminator.
  serialized->clear();
  for (size_t e = 0; e < tensor_element_count; ++e) {
    std::string str = tensor->get(e).to<std::string>();
    const char* cstr = str.c_str();
    size_t len = str.length();
    serialized->append(reinterpret_cast<const char*>(&len), sizeof(uint32_t));
    if (len > 0) {
      serialized->append(cstr, len);
    }
  }

  // Allocate a buffer large enough to hold the serialized tensor.
  TRITONSERVER_MemoryType actual_memory_type = TRITONSERVER_MEMORY_CPU;
  int64_t actual_memory_type_id = 0;

  TRITONSERVER_Error* err;
  void* buffer;

  if (!state) {
    auto err = TRITONBACKEND_OutputBuffer(
        response_output, &buffer, serialized->size(), &actual_memory_type,
        &actual_memory_type_id);
    if (err != nullptr) {
      RESPOND_AND_SET_NULL_IF_ERROR(response, err);
      return cuda_copy;
    }
  } else {
    auto err = TRITONBACKEND_StateBuffer(
        response_state, &buffer, serialized->size(), &actual_memory_type,
        &actual_memory_type_id);
    if (err != nullptr) {
      RESPOND_AND_SET_NULL_IF_ERROR(response, err);
      return cuda_copy;
    }
  }
  // Copy the serialized tensor into the allocated buffer.
  bool cuda_used = false;
  err = CopyBuffer(
      "String output", TRITONSERVER_MEMORY_CPU /* src_memory_type */,
      0 /* src_memory_type_id */, actual_memory_type, actual_memory_type_id,
      serialized->size(), reinterpret_cast<const void*>(serialized->c_str()),
      buffer, stream, &cuda_used);
  cuda_copy |= cuda_used;

  if (err != nullptr) {
    RESPOND_AND_SET_NULL_IF_ERROR(response, err);
    return cuda_copy;
  }

  if (state) {
    RESPOND_AND_SET_NULL_IF_ERROR(
        response, TRITONBACKEND_StateUpdate(response_state));
  }

  return cuda_copy;
}

bool
SetStringInputTensor(
    torch::List<std::string>* input_list, TRITONBACKEND_Input* input,
    const char* name, const uint32_t buffer_count,
    const size_t request_element_cnt, TRITONBACKEND_Response** response,
    cudaStream_t stream, const char* host_policy_name)
{
  bool cuda_copy = false;

  // For string data type, we always need to have the data on CPU so
  // that we can read string length and construct the string
  // properly. So if the request's input tensor is not in CPU need to
  // copy it there.
  const char* content = nullptr;
  size_t content_byte_size = 0;

  std::vector<char> contiguous_buffer;
  auto err = GetContiguousInputContent(
      input, buffer_count, &content, &content_byte_size, &contiguous_buffer,
      stream, &cuda_copy);
  if (err != nullptr) {
    RESPOND_AND_SET_NULL_IF_ERROR(response, err);
    FillStringTensor(input_list, request_element_cnt);
    return cuda_copy;
  }

#ifdef TRITON_ENABLE_GPU
  if (cuda_copy) {
    cudaStreamSynchronize(stream);
    cuda_copy = false;
  }
#endif  // TRITON_ENABLE_GPU

  std::vector<std::pair<const char*, const uint32_t>> str_list;
  err = ValidateStringBuffer(
      content, content_byte_size, request_element_cnt, name, &str_list);
  // Set string values.
  for (const auto& [addr, len] : str_list) {
    input_list->push_back(std::string(addr, len));
  }

  size_t element_cnt = str_list.size();
  if (err != nullptr) {
    RESPOND_AND_SET_NULL_IF_ERROR(response, err);
    FillStringTensor(input_list, request_element_cnt - element_cnt);
  }
  return cuda_copy;
}

bool
SetStringOutputBuffer(
    torch::List<torch::jit::IValue>* tensor, TRITONBACKEND_Response** response,
    TRITONBACKEND_Output* response_output, const size_t tensor_element_count,
    cudaStream_t stream, std::string* serialized)
{
  return SetStringBuffer(
      tensor, response, response_output, nullptr /* response_state */,
      tensor_element_count, stream, serialized, false /* state */);
}

bool
SetStringStateBuffer(
    torch::List<torch::jit::IValue>* tensor, TRITONBACKEND_Response** response,
    TRITONBACKEND_State* response_state, const size_t tensor_element_count,
    cudaStream_t stream, std::string* serialized)
{
  return SetStringBuffer(
      tensor, response, nullptr /* response_output */, response_state,
      tensor_element_count, stream, serialized, true /* state */);
}

}  // namespace triton::backend::pytorch
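
The string encoding that SetStringBuffer produces and SetStringInputTensor consumes (a 4-byte length prefix followed by the raw bytes, with no null terminator) can be illustrated with a small standalone round trip. This sketch is not part of the commit and assumes a little-endian host, mirroring the sizeof(uint32_t) append in the code above:

// Standalone sketch of the length-prefixed string encoding described above.
// Not part of the commit; assumes a little-endian host.
#include <cstdint>
#include <cstring>
#include <iostream>
#include <string>
#include <vector>

// Serialize each string as a 4-byte length followed by its bytes.
static std::string Serialize(const std::vector<std::string>& strs)
{
  std::string out;
  for (const auto& s : strs) {
    const uint32_t len = static_cast<uint32_t>(s.size());
    out.append(reinterpret_cast<const char*>(&len), sizeof(uint32_t));
    out.append(s);
  }
  return out;
}

// Parse the buffer back into strings; this mirrors what ValidateStringBuffer
// and the SetStringInputTensor loop do on the input path.
static std::vector<std::string> Deserialize(const std::string& buf)
{
  std::vector<std::string> out;
  size_t offset = 0;
  while (offset + sizeof(uint32_t) <= buf.size()) {
    uint32_t len = 0;
    std::memcpy(&len, buf.data() + offset, sizeof(uint32_t));
    offset += sizeof(uint32_t);
    if (offset + len > buf.size()) {
      break;  // truncated element; stop rather than read past the buffer
    }
    out.emplace_back(buf.data() + offset, len);
    offset += len;
  }
  return out;
}

int main()
{
  const std::vector<std::string> in{"hello", "", "triton"};
  const std::string wire = Serialize(in);
  for (const auto& s : Deserialize(wire)) {
    std::cout << "[" << s << "]\n";  // prints [hello], [], [triton]
  }
  return 0;
}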

src/_static_.hh

Lines changed: 106 additions & 0 deletions
@@ -0,0 +1,106 @@
// Copyright 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//  * Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
//  * Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//  * Neither the name of NVIDIA CORPORATION nor the names of its
//    contributors may be used to endorse or promote products derived
//    from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#pragma once

#include <stdint.h>

#include <cstdint>
#include <exception>
#include <mutex>

#include "libtorch_utils.h"
#include "triton/backend/backend_common.h"
#include "triton/backend/backend_input_collector.h"
#include "triton/backend/backend_memory.h"
#include "triton/backend/backend_model.h"
#include "triton/backend/backend_model_instance.h"
#include "triton/backend/backend_output_responder.h"
#include "triton/common/nvtx.h"
#include "triton/core/tritonbackend.h"

#ifdef TRITON_PYTORCH_ENABLE_TORCHVISION
// Suppress warnings in torch headers
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wsign-compare"
#pragma warning(push, 0)
#include <torchvision/ops/ops.h>
#include <torchvision/vision.h>  // Torchvision header
#pragma warning(pop)
#pragma GCC diagnostic pop
#endif  // TRITON_PYTORCH_ENABLE_TORCHVISION

#ifdef TRITON_ENABLE_GPU
#include <c10/cuda/CUDACachingAllocator.h>
#include <c10/cuda/CUDAGuard.h>
#include <cuda_runtime_api.h>
#endif  // TRITON_ENABLE_GPU

// for thread control
// https://pytorch.org/docs/stable/notes/cpu_threading_torchscript_inference.html#runtime-api
// https://github.com/pytorch/pytorch/blob/v2.2.1-rc3/aten/src/ATen/Parallel.h#L133
#include <ATen/Parallel.h>


namespace triton::backend::pytorch {

void FillStringTensor(torch::List<std::string>* input_list, const size_t cnt);

// This function will return a tensor's contents as a contiguous
// chunk in system memory. In some cases this will require copying the data.
// If that happens, 'contiguous_buffer' will be set to hold the contiguous
// chunk and 'cuda_copy' will be set to indicate whether CUDA copy is
// conducted. The data copy can be avoided if the input is already in
// a contiguous chunk and the input is located in memory type and id
// specified.
TRITONSERVER_Error* GetContiguousInputContent(
    TRITONBACKEND_Input* rinput, const uint32_t buffer_count,
    const char** content, size_t* content_byte_size,
    std::vector<char>* contiguous_buffer, cudaStream_t stream, bool* cuda_copy);

bool SetStringBuffer(
    torch::List<torch::jit::IValue>* tensor, TRITONBACKEND_Response** response,
    TRITONBACKEND_Output* response_output, TRITONBACKEND_State* response_state,
    const size_t tensor_element_count, cudaStream_t stream,
    std::string* serialized, bool state);

bool SetStringInputTensor(
    torch::List<std::string>* input_list, TRITONBACKEND_Input* input,
    const char* name, const uint32_t buffer_count,
    const size_t request_element_cnt, TRITONBACKEND_Response** response,
    cudaStream_t stream, const char* host_policy_name);

bool SetStringOutputBuffer(
    torch::List<torch::jit::IValue>* tensor, TRITONBACKEND_Response** response,
    TRITONBACKEND_Output* response_output, const size_t tensor_element_count,
    cudaStream_t stream, std::string* serialized);

bool SetStringStateBuffer(
    torch::List<torch::jit::IValue>* tensor, TRITONBACKEND_Response** response,
    TRITONBACKEND_State* response_state, const size_t tensor_element_count,
    cudaStream_t stream, std::string* serialized);

}  // namespace triton::backend::pytorch
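
For orientation, a caller would include _static_.hh and feed each string (BYTES) input through SetStringInputTensor. The wrapper below is a hedged sketch rather than code from this commit; the name GatherStringInput and its surrounding plumbing are placeholders, and only the _static_.hh declarations are taken from the diff above:

// Hypothetical caller sketch; only the _static_.hh declarations are real,
// the wrapper itself is illustrative.
#include "_static_.hh"

namespace triton::backend::pytorch {

bool
GatherStringInput(
    TRITONBACKEND_Input* input, const char* name, const uint32_t buffer_count,
    const size_t element_count, TRITONBACKEND_Response** response,
    cudaStream_t stream, torch::List<std::string>* strings)
{
  // On failure the helper records the error on 'response' and pads 'strings'
  // with empty elements so the element count stays consistent.
  bool cuda_copy = SetStringInputTensor(
      strings, input, name, buffer_count, element_count, response, stream,
      nullptr /* host_policy_name */);

  // The return value reports whether a CUDA copy was issued on 'stream';
  // a caller typically ORs these flags across inputs and synchronizes once.
  return cuda_copy;
}

}  // namespace triton::backend::pytorch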
