
Commit db84765

Add parameters support to InferResponse
* Track parameters in InferResponse
* Add parameters to the InferResponse Python binding
* Move the parameters argument up in the InferResponse constructor argument list
* Set the parameters on the Triton response
* Send response parameters only on non-error responses
* Fix a double declaration
* Unify Python dictionary parameters into a JSON string
Parent: b771f4f

4 files changed: +159 −82 lines
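In model code, the change surfaces as a new `parameters` keyword on `InferenceResponse`. A minimal sketch of how a model might use it (the model skeleton and parameter values here are hypothetical; the keyword name and the accepted value types come from the binding added in this commit):

import triton_python_backend_utils as pb_utils


class TritonPythonModel:
    def execute(self, requests):
        responses = []
        for request in requests:
            # Keys must be str and values bool/int/str; anything else is
            # rejected by the stub's validation (PyParametersToJSON below).
            responses.append(
                pb_utils.InferenceResponse(
                    output_tensors=[],
                    parameters={"triton": "python", "attempts": 2, "cached": False},
                )
            )
        return responses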

src/infer_response.cc

Lines changed: 66 additions & 11 deletions
@@ -1,4 +1,4 @@
-// Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -39,8 +39,10 @@ namespace triton { namespace backend { namespace python {

 InferResponse::InferResponse(
     const std::vector<std::shared_ptr<PbTensor>>& output_tensors,
-    std::shared_ptr<PbError> error, const bool is_last_response, void* id)
-    : error_(error), is_last_response_(is_last_response), id_(id)
+    std::shared_ptr<PbError> error, std::string parameters,
+    const bool is_last_response, void* id)
+    : error_(error), is_last_response_(is_last_response), id_(id),
+      parameters_(std::move(parameters))
 {
   for (auto& output : output_tensors) {
     if (!output) {
@@ -58,6 +60,12 @@ InferResponse::OutputTensors()
   return output_tensors_;
 }

+std::string&
+InferResponse::Parameters()
+{
+  return parameters_;
+}
+
 bool
 InferResponse::HasError()
 {
@@ -106,6 +114,9 @@ InferResponse::SaveToSharedMemory(
       j++;
     }
     response_shm_ptr->id = id_;
+
+    parameters_shm_ = PbString::Create(shm_pool, parameters_);
+    response_shm_ptr->parameters = parameters_shm_->ShmHandle();
   }
 }

@@ -143,6 +154,8 @@ InferResponse::LoadFromSharedMemory(

   std::shared_ptr<PbError> pb_error;
   std::vector<std::shared_ptr<PbTensor>> output_tensors;
+  std::shared_ptr<PbString> parameters_shm;
+  std::string parameters;

   // If the error field is set, do not load output tensors from shared memory.
   if (response_shm_ptr->has_error && response_shm_ptr->is_error_set) {
@@ -154,33 +167,43 @@ InferResponse::LoadFromSharedMemory(
     bi::managed_external_buffer::handle_t* tensor_handle_shm =
         reinterpret_cast<bi::managed_external_buffer::handle_t*>(
             response_shm.data_.get() + sizeof(ResponseShm));
+    {
 #ifdef TRITON_PB_STUB
-    // Need to acquire the GIL to avoid hangs.
-    py::gil_scoped_acquire acquire;
+      // Need to acquire the GIL to avoid hangs.
+      py::gil_scoped_acquire acquire;
 #endif
-    for (size_t idx = 0; idx < requested_output_count; ++idx) {
-      std::shared_ptr<PbTensor> pb_tensor = PbTensor::LoadFromSharedMemory(
-          shm_pool, tensor_handle_shm[idx], open_cuda_handle);
-      output_tensors.emplace_back(std::move(pb_tensor));
+      for (size_t idx = 0; idx < requested_output_count; ++idx) {
+        std::shared_ptr<PbTensor> pb_tensor = PbTensor::LoadFromSharedMemory(
+            shm_pool, tensor_handle_shm[idx], open_cuda_handle);
+        output_tensors.emplace_back(std::move(pb_tensor));
+      }
     }
+
+    parameters_shm = std::move(
+        PbString::LoadFromSharedMemory(shm_pool, response_shm_ptr->parameters));
+    parameters = parameters_shm->String();
   }

   return std::unique_ptr<InferResponse>(new InferResponse(
       response_shm, output_tensors, pb_error,
-      response_shm_ptr->is_last_response, response_shm_ptr->id));
+      response_shm_ptr->is_last_response, response_shm_ptr->id, parameters_shm,
+      parameters));
 }

 InferResponse::InferResponse(
     AllocatedSharedMemory<char>& response_shm,
     std::vector<std::shared_ptr<PbTensor>>& output_tensors,
-    std::shared_ptr<PbError>& pb_error, const bool is_last_response, void* id)
+    std::shared_ptr<PbError>& pb_error, const bool is_last_response, void* id,
+    std::shared_ptr<PbString>& parameters_shm, std::string& parameters)
 {
   response_shm_ = std::move(response_shm);
   output_tensors_ = std::move(output_tensors);
   error_ = std::move(pb_error);
   shm_handle_ = response_shm_.handle_;
   id_ = id;
   is_last_response_ = is_last_response;
+  parameters_shm_ = std::move(parameters_shm);
+  parameters_ = std::move(parameters);
 }

 std::shared_ptr<PbError>&
@@ -387,6 +410,38 @@ InferResponse::Send(
     cuda_copy |= cuda_used;
   }

+  if (!parameters_.empty()) {
+    triton::common::TritonJson::Value param;
+    THROW_IF_TRITON_ERROR(
+        param.Parse(parameters_.c_str(), parameters_.length()));
+    std::vector<std::string> param_keys;
+    THROW_IF_TRITON_ERROR(param.Members(&param_keys));
+    for (const auto& key : param_keys) {
+      triton::common::TritonJson::Value value;
+      if (!param.Find(key.c_str(), &value)) {
+        throw PythonBackendException("Unexpected missing key on parameters");
+      }
+      if (value.IsString()) {
+        std::string string_value;
+        THROW_IF_TRITON_ERROR(value.AsString(&string_value));
+        THROW_IF_TRITON_ERROR(TRITONBACKEND_ResponseSetStringParameter(
+            response, key.c_str(), string_value.c_str()));
+      } else if (value.IsInt()) {
+        int64_t int_value = 0;
+        THROW_IF_TRITON_ERROR(value.AsInt(&int_value));
+        THROW_IF_TRITON_ERROR(TRITONBACKEND_ResponseSetIntParameter(
+            response, key.c_str(), int_value));
+      } else if (value.IsBool()) {
+        bool bool_value = false;
+        THROW_IF_TRITON_ERROR(value.AsBool(&bool_value));
+        THROW_IF_TRITON_ERROR(TRITONBACKEND_ResponseSetBoolParameter(
+            response, key.c_str(), bool_value));
+      } else {
+        throw PythonBackendException("Unsupported value type on parameters");
+      }
+    }
+  }
+
 #ifdef TRITON_ENABLE_GPU
   if (cuda_copy) {
     cudaStreamSynchronize(reinterpret_cast<cudaStream_t>(cuda_stream));
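The parameters travel as a single JSON object serialized to a string, and `Send()` routes each member by type onto the corresponding `TRITONBACKEND_ResponseSet{String,Int,Bool}Parameter` call. A small Python sketch of that mapping for illustration (a mirror of the C++ logic above, not the code path itself; the `set_*` callbacks stand in for the three backend API calls):

import json

def apply_response_parameters(parameters_json, set_string, set_int, set_bool):
    """Mirror of InferResponse::Send(): route each JSON member by type."""
    if not parameters_json:
        return  # an empty string means "no parameters"; nothing is attached
    for key, value in json.loads(parameters_json).items():
        # bool is checked before int because bool is an int subclass in
        # Python; this mirrors the C++ IsString/IsInt/IsBool branches.
        if isinstance(value, str):
            set_string(key, value)
        elif isinstance(value, bool):
            set_bool(key, value)
        elif isinstance(value, int):
            set_int(key, value)
        else:
            raise ValueError("Unsupported value type on parameters")

apply_response_parameters(
    json.dumps({"triton": "python", "attempts": 2, "cached": False}),
    set_string=lambda k, v: print("string", k, v),
    set_int=lambda k, v: print("int", k, v),
    set_bool=lambda k, v: print("bool", k, v),
)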

src/infer_response.h

Lines changed: 9 additions & 4 deletions
@@ -1,4 +1,4 @@
-// Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -38,6 +38,7 @@ namespace triton { namespace backend { namespace python {

 struct ResponseShm {
   uint32_t outputs_size;
+  bi::managed_external_buffer::handle_t parameters;
   bi::managed_external_buffer::handle_t error;
   bool has_error;
   // Indicates whether this error has a message or not.
@@ -72,9 +73,10 @@ class InferResponse {
  public:
   InferResponse(
       const std::vector<std::shared_ptr<PbTensor>>& output_tensors,
-      std::shared_ptr<PbError> error = nullptr,
+      std::shared_ptr<PbError> error = nullptr, std::string parameters = "",
       const bool is_last_response = true, void* id = nullptr);
   std::vector<std::shared_ptr<PbTensor>>& OutputTensors();
+  std::string& Parameters();
   void SaveToSharedMemory(
       std::unique_ptr<SharedMemoryManager>& shm_pool, bool copy_gpu = true);
   static std::unique_ptr<InferResponse> LoadFromSharedMemory(
@@ -116,8 +118,8 @@ class InferResponse {
   InferResponse(
       AllocatedSharedMemory<char>& response_shm,
       std::vector<std::shared_ptr<PbTensor>>& output_tensors,
-      std::shared_ptr<PbError>& pb_error, const bool is_last_response,
-      void* id);
+      std::shared_ptr<PbError>& pb_error, const bool is_last_response, void* id,
+      std::shared_ptr<PbString>& parameters_shm, std::string& parameters);
   std::vector<std::shared_ptr<PbTensor>> output_tensors_;

   std::shared_ptr<PbError> error_;
@@ -128,6 +130,9 @@ class InferResponse {
   bool is_last_response_;
   // Representing the request id that the response was created from.
   void* id_;
+
+  std::shared_ptr<PbString> parameters_shm_;
+  std::string parameters_;
 };

 }}} // namespace triton::backend::python

src/pb_stub.cc

Lines changed: 72 additions & 59 deletions
@@ -1,4 +1,4 @@
-// Copyright 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -104,6 +104,28 @@ PyDefaultArgumentToMutableType(const py::object& argument)
       std::string(py::str(argument.get_type())));
 }

+std::string
+PyParametersToJSON(const py::dict& parameters)
+{
+  for (const auto& pair : parameters) {
+    if (!py::isinstance<py::str>(pair.first)) {
+      throw PythonBackendException(
+          "Expect parameters keys to have type str, found type " +
+          std::string(py::str(pair.first.get_type())));
+    }
+    if (!py::isinstance<py::bool_>(pair.second) &&
+        !py::isinstance<py::int_>(pair.second) &&
+        !py::isinstance<py::str>(pair.second)) {
+      throw PythonBackendException(
+          "Expect parameters values to have type bool/int/str, found type " +
+          std::string(py::str(pair.second.get_type())));
+    }
+  }
+  py::module_ py_json = py::module_::import("json");
+  std::string parameters_str = py::str(py_json.attr("dumps")(parameters));
+  return parameters_str;
+}
+
 void
 AsyncEventFutureDoneCallback(const py::object& py_future)
 {
@@ -1714,59 +1736,41 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module)
   py::class_<InferRequest, std::shared_ptr<InferRequest>>(
       module, "InferenceRequest")
       .def(
-          py::init([](const std::string& request_id,
-                      const py::object& correlation_id,
-                      const std::vector<std::shared_ptr<PbTensor>>& inputs,
-                      const std::vector<std::string>& requested_output_names,
-                      const std::string& model_name,
-                      const int64_t model_version, const uint32_t flags,
-                      const uint64_t timeout,
-                      const PreferredMemory& preferred_memory,
-                      const InferenceTrace& trace,
-                      const py::object& parameters_) {
-            py::dict parameters =
-                PyDefaultArgumentToMutableType<py::dict>(parameters_);
-            std::set<std::string> requested_outputs;
-            for (auto& requested_output_name : requested_output_names) {
-              requested_outputs.emplace(requested_output_name);
-            }
-            for (const auto& pair : parameters) {
-              if (!py::isinstance<py::str>(pair.first)) {
-                throw PythonBackendException(
-                    "Expect parameters keys to have type str, found type " +
-                    std::string(py::str(pair.first.get_type())));
-              }
-              if (!py::isinstance<py::bool_>(pair.second) &&
-                  !py::isinstance<py::int_>(pair.second) &&
-                  !py::isinstance<py::str>(pair.second)) {
-                throw PythonBackendException(
-                    "Expect parameters values to have type bool/int/str, found "
-                    "type " +
-                    std::string(py::str(pair.second.get_type())));
-              }
-            }
-            py::module_ py_json = py::module_::import("json");
-            std::string parameters_str =
-                py::str(py_json.attr("dumps")(parameters));
-
-            CorrelationId correlation_id_obj;
-            if (py::isinstance<py::int_>(correlation_id)) {
-              correlation_id_obj =
-                  CorrelationId(py::cast<uint64_t>(correlation_id));
-            } else if (py::isinstance<py::str>(correlation_id)) {
-              correlation_id_obj =
-                  CorrelationId(py::cast<std::string>(correlation_id));
-            } else {
-              throw PythonBackendException(
-                  "Correlation ID must be integer or string");
-            }
-
-            return std::make_shared<InferRequest>(
-                request_id, correlation_id_obj, inputs, requested_outputs,
-                model_name, model_version, parameters_str, flags, timeout,
-                0 /*response_factory_address*/, 0 /*request_address*/,
-                preferred_memory, trace);
-          }),
+          py::init(
+              [](const std::string& request_id,
+                 const py::object& correlation_id,
+                 const std::vector<std::shared_ptr<PbTensor>>& inputs,
+                 const std::vector<std::string>& requested_output_names,
+                 const std::string& model_name, const int64_t model_version,
+                 const uint32_t flags, const uint64_t timeout,
+                 const PreferredMemory& preferred_memory,
+                 const InferenceTrace& trace, const py::object& parameters_) {
+                py::dict parameters =
+                    PyDefaultArgumentToMutableType<py::dict>(parameters_);
+                std::set<std::string> requested_outputs;
+                for (auto& requested_output_name : requested_output_names) {
+                  requested_outputs.emplace(requested_output_name);
+                }
+                std::string parameters_str = PyParametersToJSON(parameters);

+                CorrelationId correlation_id_obj;
+                if (py::isinstance<py::int_>(correlation_id)) {
+                  correlation_id_obj =
+                      CorrelationId(py::cast<uint64_t>(correlation_id));
+                } else if (py::isinstance<py::str>(correlation_id)) {
+                  correlation_id_obj =
+                      CorrelationId(py::cast<std::string>(correlation_id));
+                } else {
+                  throw PythonBackendException(
+                      "Correlation ID must be integer or string");
+                }

+                return std::make_shared<InferRequest>(
+                    request_id, correlation_id_obj, inputs, requested_outputs,
+                    model_name, model_version, parameters_str, flags, timeout,
+                    0 /*response_factory_address*/, 0 /*request_address*/,
+                    preferred_memory, trace);
+              }),
       py::arg("request_id").none(false) = "",
       py::arg("correlation_id").none(false) = 0,
       py::arg("inputs").none(false),
@@ -1869,16 +1873,25 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module)
   py::class_<InferResponse, std::shared_ptr<InferResponse>>(
       module, "InferenceResponse")
       .def(
-          py::init<
-              const std::vector<std::shared_ptr<PbTensor>>&,
-              std::shared_ptr<PbError>>(),
+          py::init(
+              [](const std::vector<std::shared_ptr<PbTensor>>& output_tensors,
+                 const std::shared_ptr<PbError>& error,
+                 const py::object& parameters_) {
+                py::dict parameters =
+                    PyDefaultArgumentToMutableType<py::dict>(parameters_);
+                std::string parameters_str = PyParametersToJSON(parameters);
+                return std::make_shared<InferResponse>(
+                    output_tensors, error, parameters_str /* parameters */);
+              }),
           py::arg("output_tensors") = py::list(),
-          py::arg("error") = static_cast<std::shared_ptr<PbError>>(nullptr))
+          py::arg("error") = static_cast<std::shared_ptr<PbError>>(nullptr),
+          py::arg("parameters") = py::none())
       .def(
           "output_tensors", &InferResponse::OutputTensors,
           py::return_value_policy::reference)
       .def("has_error", &InferResponse::HasError)
-      .def("error", &InferResponse::Error);
+      .def("error", &InferResponse::Error)
+      .def("parameters", &InferResponse::Parameters);

   py::class_<ResponseSender, std::shared_ptr<ResponseSender>>(
       module, "InferenceResponseSender")

src/request_executor.cc

Lines changed: 12 additions & 8 deletions
@@ -1,4 +1,4 @@
-// Copyright 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -153,20 +153,22 @@ InferResponseComplete(
     output_tensors.clear();
   }

+  // TODO: [DLIS-7864] Pass response parameters from BLS response.
   if (!infer_payload->IsDecoupled()) {
     infer_response = std::make_unique<InferResponse>(
-        output_tensors, pb_error, true /* is_last_response */);
+        output_tensors, pb_error, "" /* parameters */,
+        true /* is_last_response */);
   } else {
     if ((flags & TRITONSERVER_RESPONSE_COMPLETE_FINAL) == 0) {
       // Not the last response.
       infer_response = std::make_unique<InferResponse>(
-          output_tensors, pb_error, false /* is_last_response */,
-          userp /* id */);
+          output_tensors, pb_error, "" /* parameters */,
+          false /* is_last_response */, userp /* id */);
     } else {
       // The last response.
       infer_response = std::make_unique<InferResponse>(
-          output_tensors, pb_error, true /* is_last_response */,
-          userp /* id */);
+          output_tensors, pb_error, "" /* parameters */,
+          true /* is_last_response */, userp /* id */);
     }
   }

@@ -178,11 +180,13 @@ InferResponseComplete(
       (flags & TRITONSERVER_RESPONSE_COMPLETE_FINAL) != 0) {
     // An empty response may be the last response for decoupled models.
     infer_response = std::make_unique<InferResponse>(
-        output_tensors, pb_error, true /* is_last_response */, userp /* id */);
+        output_tensors, pb_error, "" /* parameters */,
+        true /* is_last_response */, userp /* id */);
   } else {
     pb_error = std::make_shared<PbError>("Unexpected empty response.");
     infer_response = std::make_unique<InferResponse>(
-        output_tensors, pb_error, true /* is_last_response */, userp /* id */);
+        output_tensors, pb_error, "" /* parameters */,
+        true /* is_last_response */, userp /* id */);
   }

   infer_payload->SetValue(std::move(infer_response));
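Note the DLIS-7864 TODO above: on the BLS path every InferResponse is still constructed with an empty parameters string, so parameters set by a downstream model are not yet visible to the calling model. A sketch of the current behavior as this commit leaves it (the downstream model name is hypothetical; `InferenceRequest.exec()` is the existing BLS call):

import triton_python_backend_utils as pb_utils

bls_request = pb_utils.InferenceRequest(
    model_name="downstream_model",
    inputs=[],
    requested_output_names=["OUT"],
)
bls_response = bls_request.exec()
# Empty until DLIS-7864 lands, even if downstream_model set parameters.
assert bls_response.parameters() == ""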
