
Commit 08ecf30

Implement numpy array over CPU OrtValues on return values (#20539)
### Description

Create numpy arrays based on the native buffers of returned OrtValues. Hold on to the OrtValue until the numpy array is garbage collected.

### Motivation and Context

This saves CPU time on tensor copies and addresses customer concerns.
1 parent 156d521 commit 08ecf30
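
The change boils down to the standard pybind11 pattern of exposing an externally owned buffer to numpy and tying its lifetime to a base object. Below is a minimal sketch of that pattern, assuming a hypothetical `Owner` type that stands in for the retained OrtValue; none of these names are the commit's actual helpers.

```cpp
#include <pybind11/numpy.h>

#include <memory>
#include <vector>

namespace py = pybind11;

// Hypothetical stand-in for the OrtValue that must outlive the numpy array.
struct Owner {
  std::vector<float> data;
};

// Wrap the owner's buffer in a numpy array without copying. The py::capsule
// becomes the array's base object; its destructor runs when the array is
// garbage collected, which is when the owner (and its buffer) is released.
py::array_t<float> WrapBuffer(std::unique_ptr<Owner> owner) {
  float* ptr = owner->data.data();
  const py::ssize_t size = static_cast<py::ssize_t>(owner->data.size());
  py::capsule base(owner.release(),
                   [](void* p) { delete static_cast<Owner*>(p); });
  // Passing `base` tells numpy the memory is externally owned (no copy).
  return py::array_t<float>({size}, ptr, base);
}
```

Because the capsule is installed as the array's base, numpy never frees the buffer itself; the owner is deleted only when the last array referencing it is collected.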

File tree

7 files changed: +175 −118 lines changed


onnxruntime/python/onnxruntime_inference_collection.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -646,7 +646,7 @@ def get_outputs_as_ortvaluevector(self):
         return self._iobinding.get_outputs()
 
     def copy_outputs_to_cpu(self):
-        """Copy output contents to CPU (if on another device). No-op if already on the CPU."""
+        """Copy output contents to CPU."""
         return self._iobinding.copy_outputs_to_cpu()
 
     def clear_binding_inputs(self):
```

onnxruntime/python/onnxruntime_pybind_iobinding.cc

Lines changed: 11 additions & 7 deletions
```diff
@@ -161,23 +161,27 @@ void addIoBindingMethods(pybind11::module& m) {
           return io_binding->Get()->GetOutputs();
         },
         py::return_value_policy::reference_internal)
-      .def("copy_outputs_to_cpu", [](const SessionIOBinding* io_binding) -> std::vector<py::object> {
+      .def("copy_outputs_to_cpu", [](const SessionIOBinding* io_binding) -> py::list {
         const std::vector<OrtValue>& outputs = io_binding->Get()->GetOutputs();
-        std::vector<py::object> rfetch;
-        rfetch.reserve(outputs.size());
+
         size_t pos = 0;
         const auto& dtm = io_binding->GetInferenceSession()->GetDataTransferManager();
+
+        py::list result;
         for (const auto& ort_value : outputs) {
           if (ort_value.IsTensor()) {
-            rfetch.push_back(AddTensorAsPyObj(ort_value, &dtm, nullptr));
+            // We make a copy of the tensor to CPU even if it is already on CPU
+            // as the function name implies using DataTransferManager.
+            py::array arr = PrimitiveTensorToNumpyFromDevice(ort_value, &dtm);
+            result.append(py::cast<py::object>(arr));
           } else if (ort_value.IsSparseTensor()) {
-            rfetch.push_back(GetPyObjectFromSparseTensor(pos, ort_value, &dtm));
+            result.append(GetPyObjectFromSparseTensor(pos, ort_value, &dtm));
           } else {
-            rfetch.push_back(AddNonTensorAsPyObj(ort_value, &dtm, nullptr));
+            result.append(AddNonTensorAsPyObj(ort_value, &dtm, nullptr));
           }
           ++pos;
         }
-        return rfetch;
+        return result;
       });
 }
```

onnxruntime/python/onnxruntime_pybind_mlvalue.h

Lines changed: 40 additions & 3 deletions
```diff
@@ -16,6 +16,8 @@
 #include "core/framework/ort_value.h"
 #include "core/session/inference_session.h"
 
+#include <variant>
+
 PYBIND11_MAKE_OPAQUE(std::vector<OrtValue>);
 
 namespace onnxruntime {
@@ -40,6 +42,8 @@ MLDataType NumpyTypeToOnnxRuntimeTensorType(int numpy_type);
 
 using MemCpyFunc = void (*)(void*, const void*, size_t);
 
+using DataTransferAlternative = std::variant<const DataTransferManager*, MemCpyFunc>;
+
 void CpuToCpuMemCpy(void*, const void*, size_t);
 
 void CopyDataToTensor(const pybind11::array& py_array, int npy_type, Tensor& tensor, MemCpyFunc mem_cpy_to_device = CpuToCpuMemCpy);
@@ -117,9 +121,42 @@ void CreateGenericMLValue(const onnxruntime::InputDefList* input_def_list, const
                           const std::string& name_input, const pybind11::object& value, OrtValue* p_mlvalue,
                           bool accept_only_numpy_array = false, bool use_numpy_data_memory = true, MemCpyFunc mem_cpy_to_device = CpuToCpuMemCpy);
 
-void GetPyObjFromTensor(const Tensor& rtensor, pybind11::object& obj,
-                        const DataTransferManager* data_transfer_manager = nullptr,
-                        const std::unordered_map<OrtDevice::DeviceType, MemCpyFunc>* mem_cpy_to_host_functions = nullptr);
+pybind11::object GetPyObjFromTensor(const OrtValue& rtensor,
+                                    const DataTransferManager* data_transfer_manager = nullptr,
+                                    const std::unordered_map<OrtDevice::DeviceType, MemCpyFunc>* mem_cpy_to_host_functions = nullptr);
+
+// The below two functions are used to convert OrtValue to numpy arrays
+
+/// <summary>
+/// This function operates on string tensors. Strings are always
+/// copied to python and converted to UTF-16/UCS-4/32 depending on the platform.
+/// This is accomplished using py::cast()
+///
+/// It is an error to pass a non-tensor or a non-string tensor to this function.
+/// </summary>
+/// <param name="tensor">Tensor that contains strings</param>
+/// <returns>py::array object</returns>
+pybind11::array StringTensorToNumpyArray(const Tensor& tensor);
+
+/// <summary>
+/// Creates a numpy array with shape over OrtValue memory. Numpy array
+/// does not own the memory, but it holds a copy of OrtValue in a py::capsule.
+/// OrtValue is destroyed when the numpy array is garbage collected.
+/// This is used when the OrtValue memory is on CPU.
+/// </summary>
+/// <param name="ort_value">OrtValue with data</param>
+/// <returns>numpy array</returns>
+pybind11::array PrimitiveTensorToNumpyOverOrtValue(const OrtValue& ort_value);
+
+/// <summary>
+/// Creates a numpy array with a copy of OrtValue data.
+/// This function is used when the OrtValue memory is not on CPU.
+/// </summary>
+/// <param name="ort_value">Source memory that is not on CPU.</param>
+/// <param name="data_transfer">a variant encapsulating alternatives for copying data</param>
+/// <returns>numpy array</returns>
+pybind11::array PrimitiveTensorToNumpyFromDevice(const OrtValue& ort_value,
+                                                 const DataTransferAlternative& data_transfer);
 
 template <class T>
 struct DecRefFn {
```
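
The header above only declares `PrimitiveTensorToNumpyFromDevice`; the new `DataTransferAlternative` alias indicates the copy path is selected at runtime from either a `DataTransferManager*` or a raw mem-copy function. A sketch of how such a variant is commonly dispatched with `std::visit` follows; the `CopyToHost` helper is hypothetical, not code from this commit.

```cpp
#include <cstddef>
#include <type_traits>
#include <variant>

// Stand-ins mirroring the header's aliases; the manager is opaque here.
class DataTransferManager;
using MemCpyFunc = void (*)(void*, const void*, size_t);
using DataTransferAlternative = std::variant<const DataTransferManager*, MemCpyFunc>;

// Hypothetical dispatch: run whichever copy strategy the variant holds.
void CopyToHost(void* dst, const void* src, size_t bytes,
                const DataTransferAlternative& data_transfer) {
  std::visit(
      [&](auto&& alt) {
        using T = std::decay_t<decltype(alt)>;
        if constexpr (std::is_same_v<T, const DataTransferManager*>) {
          // A real implementation would route the copy through the
          // DataTransferManager; elided here since its API is opaque.
          (void)alt;
        } else {
          alt(dst, src, bytes);  // device-specific memcpy function pointer
        }
      },
      data_transfer);
}
```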

onnxruntime/python/onnxruntime_pybind_ortvalue.cc

Lines changed: 10 additions & 12 deletions
```diff
@@ -233,20 +233,20 @@ void addOrtValueMethods(pybind11::module& m) {
 #endif
       })
       .def("shape", [](const OrtValue* ort_value) -> py::list {
-        py::list shape_arr;
 #if !defined(DISABLE_SPARSE_TENSORS)
         // OrtValue can only be a Tensor/SparseTensor, make this generic to handle non-Tensors
         ORT_ENFORCE(ort_value->IsTensor() || ort_value->IsSparseTensor(),
                     "Only OrtValues that are Tensors/SpareTensors are currently supported");
 
-        const auto& dims = (ort_value->IsTensor())
-                               ? ort_value->Get<Tensor>().Shape().GetDims()
-                               : ort_value->Get<SparseTensor>().DenseShape().GetDims();
+        const auto dims = (ort_value->IsTensor())
+                              ? ort_value->Get<Tensor>().Shape().GetDims()
+                              : ort_value->Get<SparseTensor>().DenseShape().GetDims();
 #else
         ORT_ENFORCE(ort_value->IsTensor(), "Only OrtValues that are Tensors are supported in this build");
-        const auto& dims = ort_value->Get<Tensor>().Shape().GetDims();
+        const auto dims = ort_value->Get<Tensor>().Shape().GetDims();
 #endif
 
+        py::list shape_arr;
         for (auto dim : dims) {
           // For sequence tensors - we would append a list of dims to the outermost list
           // For now only tensors are supported in OrtValue
@@ -302,18 +302,16 @@ void addOrtValueMethods(pybind11::module& m) {
       .def("numpy", [](const OrtValue* ml_value) -> py::object {
         ORT_ENFORCE(ml_value->IsTensor(), "Only OrtValues that are Tensors are convertible to Numpy objects");
 
-        py::object obj;
-
 #ifdef USE_CUDA
-        GetPyObjFromTensor(ml_value->Get<Tensor>(), obj, nullptr, GetCudaToHostMemCpyFunction());
+        py::object obj = GetPyObjFromTensor(*ml_value, nullptr, GetCudaToHostMemCpyFunction());
 #elif USE_ROCM
-        GetPyObjFromTensor(ml_value->Get<Tensor>(), obj, nullptr, GetRocmToHostMemCpyFunction());
+        py::object obj = GetPyObjFromTensor(*ml_value, nullptr, GetRocmToHostMemCpyFunction());
 #elif USE_CANN
-        GetPyObjFromTensor(ml_value->Get<Tensor>(), obj, nullptr, GetCannToHostMemCpyFunction());
+        py::object obj = GetPyObjFromTensor(*ml_value, nullptr, GetCannToHostMemCpyFunction());
 #elif USE_DML
-        GetPyObjFromTensor(ml_value->Get<Tensor>(), obj, nullptr, GetDmlToHostMemCpyFunction());
+        py::object obj = GetPyObjFromTensor(*ml_value, nullptr, GetDmlToHostMemCpyFunction());
 #else
-        GetPyObjFromTensor(ml_value->Get<Tensor>(), obj, nullptr, nullptr);
+        py::object obj = GetPyObjFromTensor(*ml_value, nullptr, nullptr);
 #endif
         return obj;
       })
```

onnxruntime/python/onnxruntime_pybind_sparse_tensor.cc

Lines changed: 2 additions & 13 deletions
```diff
@@ -305,18 +305,7 @@ void addSparseTensorMethods(pybind11::module& m) {
         if (sparse_tensor.IsDataTypeString()) {
           // Strings can not be on GPU and require conversion UTF-8 to Python UNICODE
           // We need to create a copy.
-          const int numpy_type = OnnxRuntimeTensorToNumpyType(DataTypeImpl::GetType<std::string>());
-          ORT_ENFORCE(NPY_OBJECT == numpy_type, "We are expecting to map strings to NPY_OBJECT type");
-          const auto& values_shape = sparse_tensor.Values().Shape();
-          py::dtype dtype("object");
-          py::array result(dtype, values_shape.GetDims(), {});
-          auto* out_ptr = static_cast<py::object*>(
-              PyArray_DATA(reinterpret_cast<PyArrayObject*>(result.ptr())));
-          const std::string* src = sparse_tensor.Values().Data<std::string>();
-          for (int64_t i = 0, size = values_shape.Size(); i < size; ++i, src++) {
-            out_ptr[i] = py::cast(*src);
-          }
-          return result;
+          return StringTensorToNumpyArray(sparse_tensor.Values());
         } else {
           utils::MLTypeCallDispatcher<float, double, int8_t, uint8_t, int16_t, uint16_t, int32_t, uint32_t, int64_t, uint64_t>
               t_disp(sparse_tensor.GetElementType());
@@ -386,7 +375,7 @@ void addSparseTensorMethods(pybind11::module& m) {
       })
       .def("dense_shape", [](const PySparseTensor* py_tensor) -> py::list {
         const SparseTensor& st = py_tensor->Instance();
-        const auto& dims = st.DenseShape().GetDims();
+        const auto dims = st.DenseShape().GetDims();
         // We create a copy of dimensions, it is small
         py::list py_dims;
         for (auto d : dims) {
```
