Commit 2e66c48

maintenance: Separate Code into Separate Files
This change splits the monolithic src/libtorch.cc into multiple files, giving each class its own header and source file.
1 parent 6d02d35 commit 2e66c48
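
The reorganization follows the usual C++ convention of declaring a class in a header and defining it in a matching source file. As a rough, hypothetical sketch of that pattern (the names and members below are illustrative only; the commit's actual model_state.cc and model_instance_state.cc are far larger and are not shown in this diff):

// model_state.hh -- hypothetical sketch of the header/source split, not the commit's actual code
#pragma once

#include <string>
#include <utility>

namespace triton::backend::pytorch {

// The declaration lives in the header so other translation units can use the type.
class ModelState {
 public:
  explicit ModelState(std::string name);
  const std::string& Name() const;

 private:
  std::string name_;  // placeholder member for illustration
};

}  // namespace triton::backend::pytorch

// model_state.cc -- hypothetical sketch; the definitions live in the source file
#include "model_state.hh"

namespace triton::backend::pytorch {

ModelState::ModelState(std::string name) : name_(std::move(name)) {}

const std::string&
ModelState::Name() const
{
  return name_;
}

}  // namespace triton::backend::pytorch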

File tree

10 files changed: +2902 −2489 lines

CMakeLists.txt

Lines changed: 3 additions & 0 deletions
@@ -280,6 +280,9 @@ add_library(
   src/libtorch.cc
   src/libtorch_utils.cc
   src/libtorch_utils.h
+  src/model_instance_state.cc
+  src/model_state.cc
+  src/_static_.cc
 )
 
 add_library(

src/_static_.cc

Lines changed: 254 additions & 0 deletions
@@ -0,0 +1,254 @@
// Copyright 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//  * Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
//  * Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//  * Neither the name of NVIDIA CORPORATION nor the names of its
//    contributors may be used to endorse or promote products derived
//    from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "_static_.hh"


namespace triton::backend::pytorch {

// This function will return a tensor's contents as a contiguous
// chunk in system memory. In some cases this will require copying the data.
// If that happens, 'contiguous_buffer' will be set to hold the contiguous
// chunk and 'cuda_copy' will be set to indicate whether CUDA copy is
// conducted. The data copy can be avoided if the input is already in
// a contiguous chunk and the input is located in memory type and id
// specified.
TRITONSERVER_Error*
GetContiguousInputContent(
    TRITONBACKEND_Input* rinput, const uint32_t buffer_count,
    const char** content, size_t* content_byte_size,
    std::vector<char>* contiguous_buffer, cudaStream_t stream, bool* cuda_copy)
{
  *cuda_copy = false;

  // Check input buffers to see if data copy is necessary
  size_t chunk_count = 0;
  bool type_mismatch = false;
  uint64_t total_byte_size = 0;
  for (size_t idx = 0; idx < buffer_count; ++idx) {
    TRITONSERVER_MemoryType src_memory_type;
    int64_t src_memory_type_id;
    size_t src_byte_size;
    const void* src_ptr;

    RETURN_IF_ERROR(TRITONBACKEND_InputBuffer(
        rinput, idx, &src_ptr, &src_byte_size, &src_memory_type,
        &src_memory_type_id));

    if (src_ptr != nullptr) {
      chunk_count++;
      total_byte_size += src_byte_size;
      type_mismatch |= (src_memory_type == TRITONSERVER_MEMORY_GPU);
    }
  }

  if (chunk_count == 0) {
    *content = nullptr;
    *content_byte_size = 0;
  } else if ((chunk_count == 1) && !type_mismatch) {
    TRITONSERVER_MemoryType src_memory_type;
    int64_t src_memory_type_id;
    RETURN_IF_ERROR(TRITONBACKEND_InputBuffer(
        rinput, 0, (const void**)content, content_byte_size, &src_memory_type,
        &src_memory_type_id));
  } else {
    contiguous_buffer->resize(total_byte_size);

    size_t offset = 0;
    for (size_t i = 0; i < chunk_count; i++) {
      bool cuda_used;
      TRITONSERVER_MemoryType src_memory_type;
      int64_t src_memory_type_id;
      size_t src_byte_size;
      const void* src_ptr;

      RETURN_IF_ERROR(TRITONBACKEND_InputBuffer(
          rinput, i, &src_ptr, &src_byte_size, &src_memory_type,
          &src_memory_type_id));
      RETURN_IF_ERROR(CopyBuffer(
          "Contiguous input", src_memory_type, src_memory_type_id,
          TRITONSERVER_MEMORY_CPU, 0, src_byte_size, src_ptr,
          contiguous_buffer->data() + offset, stream, &cuda_used));
      *cuda_copy |= cuda_used;
      offset += src_byte_size;
    }

    *content = contiguous_buffer->data();
    *content_byte_size = total_byte_size;
  }

  return nullptr;  // success
}

void
FillStringTensor(torch::List<std::string>* input_list, const size_t cnt)
{
  for (size_t c = 0; c < cnt; ++c) {
    input_list->push_back("");
  }
}

bool
SetStringBuffer(
    torch::List<torch::jit::IValue>* tensor, TRITONBACKEND_Response** response,
    TRITONBACKEND_Output* response_output, TRITONBACKEND_State* response_state,
    const size_t tensor_element_count, cudaStream_t stream,
    std::string* serialized, bool state)
{
  bool cuda_copy = false;

  // Serialize the output tensor strings. Each string is serialized as
  // a 4-byte length followed by the string itself with no
  // null-terminator.
  serialized->clear();
  for (size_t e = 0; e < tensor_element_count; ++e) {
    std::string str = tensor->get(e).to<std::string>();
    const char* cstr = str.c_str();
    size_t len = str.length();
    serialized->append(reinterpret_cast<const char*>(&len), sizeof(uint32_t));
    if (len > 0) {
      serialized->append(cstr, len);
    }
  }

  // Allocate a buffer large enough to hold the serialized tensor.
  TRITONSERVER_MemoryType actual_memory_type = TRITONSERVER_MEMORY_CPU;
  int64_t actual_memory_type_id = 0;

  TRITONSERVER_Error* err;
  void* buffer;

  if (!state) {
    auto err = TRITONBACKEND_OutputBuffer(
        response_output, &buffer, serialized->size(), &actual_memory_type,
        &actual_memory_type_id);
    if (err != nullptr) {
      RESPOND_AND_SET_NULL_IF_ERROR(response, err);
      return cuda_copy;
    }
  } else {
    auto err = TRITONBACKEND_StateBuffer(
        response_state, &buffer, serialized->size(), &actual_memory_type,
        &actual_memory_type_id);
    if (err != nullptr) {
      RESPOND_AND_SET_NULL_IF_ERROR(response, err);
      return cuda_copy;
    }
  }
  // Copy the serialized tensor into the allocated buffer.
  bool cuda_used = false;
  err = CopyBuffer(
      "String output", TRITONSERVER_MEMORY_CPU /* src_memory_type */,
      0 /* src_memory_type_id */, actual_memory_type, actual_memory_type_id,
      serialized->size(), reinterpret_cast<const void*>(serialized->c_str()),
      buffer, stream, &cuda_used);
  cuda_copy |= cuda_used;

  if (err != nullptr) {
    RESPOND_AND_SET_NULL_IF_ERROR(response, err);
    return cuda_copy;
  }

  if (state) {
    RESPOND_AND_SET_NULL_IF_ERROR(
        response, TRITONBACKEND_StateUpdate(response_state));
  }

  return cuda_copy;
}

bool
SetStringInputTensor(
    torch::List<std::string>* input_list, TRITONBACKEND_Input* input,
    const char* name, const uint32_t buffer_count,
    const size_t request_element_cnt, TRITONBACKEND_Response** response,
    cudaStream_t stream, const char* host_policy_name)
{
  bool cuda_copy = false;

  // For string data type, we always need to have the data on CPU so
  // that we can read string length and construct the string
  // properly. So if the request's input tensor is not in CPU need to
  // copy it there.
  const char* content = nullptr;
  size_t content_byte_size = 0;

  std::vector<char> contiguous_buffer;
  auto err = GetContiguousInputContent(
      input, buffer_count, &content, &content_byte_size, &contiguous_buffer,
      stream, &cuda_copy);
  if (err != nullptr) {
    RESPOND_AND_SET_NULL_IF_ERROR(response, err);
    FillStringTensor(input_list, request_element_cnt);
    return cuda_copy;
  }

#ifdef TRITON_ENABLE_GPU
  if (cuda_copy) {
    cudaStreamSynchronize(stream);
    cuda_copy = false;
  }
#endif  // TRITON_ENABLE_GPU

  std::vector<std::pair<const char*, const uint32_t>> str_list;
  err = ValidateStringBuffer(
      content, content_byte_size, request_element_cnt, name, &str_list);
  // Set string values.
  for (const auto& [addr, len] : str_list) {
    input_list->push_back(std::string(addr, len));
  }

  size_t element_cnt = str_list.size();
  if (err != nullptr) {
    RESPOND_AND_SET_NULL_IF_ERROR(response, err);
    FillStringTensor(input_list, request_element_cnt - element_cnt);
  }
  return cuda_copy;
}

bool
SetStringOutputBuffer(
    torch::List<torch::jit::IValue>* tensor, TRITONBACKEND_Response** response,
    TRITONBACKEND_Output* response_output, const size_t tensor_element_count,
    cudaStream_t stream, std::string* serialized)
{
  return SetStringBuffer(
      tensor, response, response_output, nullptr /* response_state */,
      tensor_element_count, stream, serialized, false /* state */);
}

bool
SetStringStateBuffer(
    torch::List<torch::jit::IValue>* tensor, TRITONBACKEND_Response** response,
    TRITONBACKEND_State* response_state, const size_t tensor_element_count,
    cudaStream_t stream, std::string* serialized)
{
  return SetStringBuffer(
      tensor, response, nullptr /* response_output */, response_state,
      tensor_element_count, stream, serialized, true /* state */);
}

}  // namespace triton::backend::pytorch
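
The string encoding that SetStringBuffer produces and SetStringInputTensor consumes (a 4-byte length prefix followed by the raw bytes, with no null terminator) can be illustrated with a small standalone round trip. This sketch is not part of the commit and assumes a little-endian host, mirroring the sizeof(uint32_t) append in the code above:

// Standalone sketch of the length-prefixed string encoding described above.
// Not part of the commit; assumes a little-endian host.
#include <cstdint>
#include <cstring>
#include <iostream>
#include <string>
#include <vector>

// Serialize each string as a 4-byte length followed by its bytes.
static std::string Serialize(const std::vector<std::string>& strs)
{
  std::string out;
  for (const auto& s : strs) {
    const uint32_t len = static_cast<uint32_t>(s.size());
    out.append(reinterpret_cast<const char*>(&len), sizeof(uint32_t));
    out.append(s);
  }
  return out;
}

// Parse the buffer back into strings; this mirrors what ValidateStringBuffer
// and the SetStringInputTensor loop do on the input path.
static std::vector<std::string> Deserialize(const std::string& buf)
{
  std::vector<std::string> out;
  size_t offset = 0;
  while (offset + sizeof(uint32_t) <= buf.size()) {
    uint32_t len = 0;
    std::memcpy(&len, buf.data() + offset, sizeof(uint32_t));
    offset += sizeof(uint32_t);
    if (offset + len > buf.size()) {
      break;  // truncated element; stop rather than read past the buffer
    }
    out.emplace_back(buf.data() + offset, len);
    offset += len;
  }
  return out;
}

int main()
{
  const std::vector<std::string> in{"hello", "", "triton"};
  const std::string wire = Serialize(in);
  for (const auto& s : Deserialize(wire)) {
    std::cout << "[" << s << "]\n";  // prints [hello], [], [triton]
  }
  return 0;
}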

src/_static_.hh

Lines changed: 106 additions & 0 deletions
@@ -0,0 +1,106 @@
// Copyright 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//  * Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
//  * Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//  * Neither the name of NVIDIA CORPORATION nor the names of its
//    contributors may be used to endorse or promote products derived
//    from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#pragma once

#include <stdint.h>

#include <cstdint>
#include <exception>
#include <mutex>

#include "libtorch_utils.h"
#include "triton/backend/backend_common.h"
#include "triton/backend/backend_input_collector.h"
#include "triton/backend/backend_memory.h"
#include "triton/backend/backend_model.h"
#include "triton/backend/backend_model_instance.h"
#include "triton/backend/backend_output_responder.h"
#include "triton/common/nvtx.h"
#include "triton/core/tritonbackend.h"

#ifdef TRITON_PYTORCH_ENABLE_TORCHVISION
// Suppress warnings in torch headers
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wsign-compare"
#pragma warning(push, 0)
#include <torchvision/ops/ops.h>
#include <torchvision/vision.h>  // Torchvision header
#pragma warning(pop)
#pragma GCC diagnostic pop
#endif  // TRITON_PYTORCH_ENABLE_TORCHVISION

#ifdef TRITON_ENABLE_GPU
#include <c10/cuda/CUDACachingAllocator.h>
#include <c10/cuda/CUDAGuard.h>
#include <cuda_runtime_api.h>
#endif  // TRITON_ENABLE_GPU

// for thread control
// https://pytorch.org/docs/stable/notes/cpu_threading_torchscript_inference.html#runtime-api
// https://github.com/pytorch/pytorch/blob/v2.2.1-rc3/aten/src/ATen/Parallel.h#L133
#include <ATen/Parallel.h>


namespace triton::backend::pytorch {

void FillStringTensor(torch::List<std::string>* input_list, const size_t cnt);

// This function will return a tensor's contents as a contiguous
// chunk in system memory. In some cases this will require copying the data.
// If that happens, 'contiguous_buffer' will be set to hold the contiguous
// chunk and 'cuda_copy' will be set to indicate whether CUDA copy is
// conducted. The data copy can be avoided if the input is already in
// a contiguous chunk and the input is located in memory type and id
// specified.
TRITONSERVER_Error* GetContiguousInputContent(
    TRITONBACKEND_Input* rinput, const uint32_t buffer_count,
    const char** content, size_t* content_byte_size,
    std::vector<char>* contiguous_buffer, cudaStream_t stream, bool* cuda_copy);

bool SetStringBuffer(
    torch::List<torch::jit::IValue>* tensor, TRITONBACKEND_Response** response,
    TRITONBACKEND_Output* response_output, TRITONBACKEND_State* response_state,
    const size_t tensor_element_count, cudaStream_t stream,
    std::string* serialized, bool state);

bool SetStringInputTensor(
    torch::List<std::string>* input_list, TRITONBACKEND_Input* input,
    const char* name, const uint32_t buffer_count,
    const size_t request_element_cnt, TRITONBACKEND_Response** response,
    cudaStream_t stream, const char* host_policy_name);

bool SetStringOutputBuffer(
    torch::List<torch::jit::IValue>* tensor, TRITONBACKEND_Response** response,
    TRITONBACKEND_Output* response_output, const size_t tensor_element_count,
    cudaStream_t stream, std::string* serialized);

bool SetStringStateBuffer(
    torch::List<torch::jit::IValue>* tensor, TRITONBACKEND_Response** response,
    TRITONBACKEND_State* response_state, const size_t tensor_element_count,
    cudaStream_t stream, std::string* serialized);

}  // namespace triton::backend::pytorch
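
For orientation, a caller would include _static_.hh and feed each string (BYTES) input through SetStringInputTensor. The wrapper below is a hedged sketch rather than code from this commit; the name GatherStringInput and its surrounding plumbing are placeholders, and only the _static_.hh declarations are taken from the diff above:

// Hypothetical caller sketch; only the _static_.hh declarations are real,
// the wrapper itself is illustrative.
#include "_static_.hh"

namespace triton::backend::pytorch {

bool
GatherStringInput(
    TRITONBACKEND_Input* input, const char* name, const uint32_t buffer_count,
    const size_t element_count, TRITONBACKEND_Response** response,
    cudaStream_t stream, torch::List<std::string>* strings)
{
  // On failure the helper records the error on 'response' and pads 'strings'
  // with empty elements so the element count stays consistent.
  bool cuda_copy = SetStringInputTensor(
      strings, input, name, buffer_count, element_count, response, stream,
      nullptr /* host_policy_name */);

  // The return value reports whether a CUDA copy was issued on 'stream';
  // a caller typically ORs these flags across inputs and synchronizes once.
  return cuda_copy;
}

}  // namespace triton::backend::pytorch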
