diff --git a/CMakeLists.txt b/CMakeLists.txt
index be8ce9c..30a5455 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -210,7 +210,6 @@ else()
COMMAND rm -fr openvino
COMMAND docker cp openvino_backend_ov:/opt/openvino openvino
COMMAND docker rm openvino_backend_ov
- COMMAND echo '' >> openvino/lib/plugins.xml
COMMENT "Building OpenVino"
)
endif() # WIN32
diff --git a/Dockerfile.drivers b/Dockerfile.drivers
new file mode 100644
index 0000000..e955040
--- /dev/null
+++ b/Dockerfile.drivers
@@ -0,0 +1,16 @@
+ARG BASE_IMAGE=tritonserver:latest
+FROM $BASE_IMAGE
+RUN mkdir /tmp/neo && cd /tmp/neo && \
+ apt-get update && apt-get install -y libtbb12 curl && \
+ curl -L -O https://github.com/intel/compute-runtime/releases/download/25.31.34666.3/libze-intel-gpu1_25.31.34666.3-0_amd64.deb && \
+ curl -L -O https://github.com/intel/compute-runtime/releases/download/25.31.34666.3/intel-opencl-icd_25.31.34666.3-0_amd64.deb && \
+ curl -L -O https://github.com/intel/compute-runtime/releases/download/25.31.34666.3/libigdgmm12_22.8.1_amd64.deb && \
+ curl -L -O https://github.com/intel/compute-runtime/releases/download/25.31.34666.3/intel-ocloc_25.31.34666.3-0_amd64.deb && \
+ curl -L -O https://github.com/intel/intel-graphics-compiler/releases/download/v2.16.0/intel-igc-core-2_2.16.0+19683_amd64.deb && \
+ curl -L -O https://github.com/intel/intel-graphics-compiler/releases/download/v2.16.0/intel-igc-opencl-2_2.16.0+19683_amd64.deb && \
+ curl -L -O https://github.com/intel/linux-npu-driver/releases/download/v1.23.0/linux-npu-driver-v1.23.0.20250827-17270089246-ubuntu2404.tar.gz && \
+ tar -xf linux-npu-driver-v1.23.0.20250827-17270089246-ubuntu2404.tar.gz && \
+ curl -L -O https://github.com/oneapi-src/level-zero/releases/download/v1.22.4/level-zero_1.22.4+u24.04_amd64.deb && \
+ dpkg -i *.deb && \
+ apt-get install -y ocl-icd-libopencl1 --no-install-recommends && \
+ rm -rf /var/lib/apt/lists/* && rm -Rf /tmp/neo
diff --git a/README.md b/README.md
index f6f3fd1..8096904 100644
--- a/README.md
+++ b/README.md
@@ -60,6 +60,7 @@ $ cd build
$ cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install -DTRITON_BUILD_OPENVINO_VERSION=2025.2.0 -DTRITON_BUILD_CONTAINER_VERSION=25.06 ..
$ make install
```
+The compiled backend will be installed in the `build/install/backends/openvino` directory.
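+
+As a minimal sketch of using the locally built backend (run from the `build` directory; the base image tag and the model repository path are illustrative assumptions), it can be mounted into the backend directory of an existing Triton container:
+```
+docker run -it --rm \
+  -v `pwd`/install/backends/openvino:/opt/tritonserver/backends/openvino \
+  -v /path/to/model_repository:/models \
+  nvcr.io/nvidia/tritonserver:25.06-py3 tritonserver --model-repository=/models
+```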
The following required Triton repositories will be pulled and used in
the build. By default the "main" branch/tag will be used for each repo
@@ -69,6 +70,27 @@ but the listed CMake argument can be used to override.
* triton-inference-server/core: -DTRITON_CORE_REPO_TAG=[tag]
* triton-inference-server/common: -DTRITON_COMMON_REPO_TAG=[tag]
+## Build a complete custom Triton image with the OpenVINO backend
+
+```
+git clone https://github.com/triton-inference-server/server
+cd server
+pip install distro requests
+python3 build.py --target-platform linux --enable-logging --enable-stats --enable-metrics --enable-cpu-metrics --endpoint grpc --endpoint http --filesystem s3 \
+--backend openvino
+```
+In the `--backend` value, a pull request reference is optional; plain `--backend openvino` builds from the `main` branch.
+The build creates an image called `tritonserver:latest`.
+
+## Add Intel GPU and NPU dependencies to the image
+
+The `Dockerfile.drivers` file adds the OpenVINO runtime drivers needed to run inference on Intel accelerators. As the base image, use either the public image with the OpenVINO backend or a custom build.
+
+```
+docker build -f Dockerfile.drivers --build-arg BASE_IMAGE=tritonserver:latest -t tritonserver:xpu .
+```
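+
+As a quick sanity check (illustrative; the device nodes depend on what your host exposes), you can confirm that the accelerator devices are visible inside a container built from this image:
+```
+docker run --rm --device /dev/dri --device /dev/accel tritonserver:xpu ls /dev/dri /dev/accel
+```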
+
+
## Using the OpenVINO Backend
### Parameters
@@ -86,6 +108,7 @@ to skip the dynamic batch sizes in backend.
* `ENABLE_BATCH_PADDING`: By default an error will be generated if backend receives a request with batch size less than max_batch_size specified in the configuration. This error can be avoided at a cost of performance by specifying `ENABLE_BATCH_PADDING` parameter as `YES`.
* `RESHAPE_IO_LAYERS`: By setting this parameter as `YES`, the IO layers are reshaped to the dimensions provided in
model configuration. By default, the dimensions in the model are used.
+* `TARGET_DEVICE`: Selects the OpenVINO device used for inference. Valid values are CPU (the default), GPU, NPU, or one of the virtual devices such as AUTO, MULTI, or HETERO (see the example below).
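+
+As an illustrative sketch (the device priority list is an assumption about the available hardware), a virtual device can be selected using the standard OpenVINO device notation:
+```
+parameters: [
+{
+  key: "TARGET_DEVICE"
+  value: {
+    string_value: "AUTO:GPU,CPU"
+  }
+}
+]
+```
+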
## Auto-Complete Model Configuration
@@ -228,6 +251,55 @@ string_value:"yes"
}
}
```
+### Running the models on Intel GPU
+
+Add a `TARGET_DEVICE` parameter to your `config.pbtxt`, for example:
+```
+parameters: [
+{
+ key: "NUM_STREAMS"
+ value: {
+ string_value: "1"
+ }
+},
+{
+ key: "PERFORMANCE_HINT"
+ value: {
+ string_value: "THROUGHPUT"
+ }
+},
+{
+ key: "TARGET_DEVICE"
+ value: {
+ string_value: "GPU"
+ }
+}
+]
+```
+
+Start the container with an extra parameter to pass the device `/dev/dri`:
+```
+docker run -it --rm --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1 ) tritonserver:xpu
+```
+
+### Running the models on Intel NPU
+
+Add a `TARGET_DEVICE` parameter to your `config.pbtxt`:
+```
+parameters: [
+{
+ key: "TARGET_DEVICE"
+ value: {
+ string_value: "NPU"
+ }
+}
+]
+```
+
+Start the container with an extra parameter to pass the device `/dev/accel`:
+```
+docker run -it --rm --device /dev/accel --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) tritonserver:xpu
+```
Check also the [Quick deploy guide](https://github.com/triton-inference-server/tutorials/tree/main/Quick_Deploy/OpenVINO).
@@ -236,5 +308,7 @@ Examples of the supported models and configs are included in the [functional tes
## Known Issues
* Models with a scalar on the input (a shape without any dimension) are not supported
+
* Reshaping using [dimension ranges](https://docs.openvino.ai/2025/openvino-workflow/model-server/ovms_docs_dynamic_shape_dynamic_model.html) is not supported.
+
* Models without output names are not supported. Models must be saved with names assigned.
diff --git a/src/openvino.cc b/src/openvino.cc
index 601d88f..8706f3c 100644
--- a/src/openvino.cc
+++ b/src/openvino.cc
@@ -84,6 +84,9 @@ class ModelState : public BackendModel {
TRITONSERVER_Error* ParseParameter(
const std::string& mkey, triton::common::TritonJson::Value& params,
      std::vector<std::pair<std::string, ov::Any>>* device_config);
+ TRITONSERVER_Error* ParseStringParameter(
+ const std::string& mkey, triton::common::TritonJson::Value& params,
+ std::string* value);
TRITONSERVER_Error* ParseParameterHelper(
const std::string& mkey, std::string* value,
      std::pair<std::string, ov::Any>* ov_property);
@@ -118,6 +121,7 @@ class ModelState : public BackendModel {
bool SkipDynamicBatchSize() { return skip_dynamic_batchsize_; }
bool EnableBatchPadding() { return enable_padding_; }
+ std::string TargetDevice() { return target_device_; }
private:
ModelState(TRITONBACKEND_Model* triton_model);
@@ -140,6 +144,7 @@ class ModelState : public BackendModel {
bool skip_dynamic_batchsize_;
bool enable_padding_;
bool reshape_io_layers_;
+ std::string target_device_;
};
TRITONSERVER_Error*
@@ -179,7 +184,7 @@ ModelState::Create(TRITONBACKEND_Model* triton_model, ModelState** state)
ModelState::ModelState(TRITONBACKEND_Model* triton_model)
: BackendModel(triton_model), model_read_(false),
skip_dynamic_batchsize_(false), enable_padding_(false),
- reshape_io_layers_(false)
+ reshape_io_layers_(false), target_device_("CPU")
{
}
@@ -238,12 +243,11 @@ ModelState::ParseParameters()
  bool status = model_config_.Find("parameters", &params);
if (status) {
RETURN_IF_ERROR(LoadCpuExtensions(params));
- RETURN_IF_ERROR(ParseBoolParameter(
- "SKIP_OV_DYNAMIC_BATCHSIZE", params, &skip_dynamic_batchsize_));
- RETURN_IF_ERROR(
- ParseBoolParameter("ENABLE_BATCH_PADDING", params, &enable_padding_));
- RETURN_IF_ERROR(
- ParseBoolParameter("RESHAPE_IO_LAYERS", params, &reshape_io_layers_));
+ ParseBoolParameter(
+ "SKIP_OV_DYNAMIC_BATCHSIZE", params, &skip_dynamic_batchsize_);
+ ParseBoolParameter("ENABLE_BATCH_PADDING", params, &enable_padding_);
+ ParseBoolParameter("RESHAPE_IO_LAYERS", params, &reshape_io_layers_);
+ ParseStringParameter("TARGET_DEVICE", params, &target_device_);
}
return nullptr;
@@ -256,18 +260,15 @@ ModelState::ParseParameters(const std::string& device)
triton::common::TritonJson::Value params;
  bool status = model_config_.Find("parameters", &params);
if (status) {
- if (device == "CPU") {
- config_[device] = {};
- auto& device_config = config_.at(device);
- RETURN_IF_ERROR(
- ParseParameter("INFERENCE_NUM_THREADS", params, &device_config));
- RETURN_IF_ERROR(
- ParseParameter("COMPILATION_NUM_THREADS", params, &device_config));
- RETURN_IF_ERROR(ParseParameter("HINT_BF16", params, &device_config));
- RETURN_IF_ERROR(ParseParameter("NUM_STREAMS", params, &device_config));
- RETURN_IF_ERROR(
- ParseParameter("PERFORMANCE_HINT", params, &device_config));
- }
+ config_[device] = {};
+ auto& device_config = config_.at(device);
+ RETURN_IF_ERROR(
+ ParseParameter("INFERENCE_NUM_THREADS", params, &device_config));
+ RETURN_IF_ERROR(
+ ParseParameter("COMPILATION_NUM_THREADS", params, &device_config));
+ RETURN_IF_ERROR(ParseParameter("HINT_BF16", params, &device_config));
+ RETURN_IF_ERROR(ParseParameter("NUM_STREAMS", params, &device_config));
+ RETURN_IF_ERROR(ParseParameter("PERFORMANCE_HINT", params, &device_config));
}
return nullptr;
@@ -277,9 +278,8 @@ TRITONSERVER_Error*
ModelState::LoadCpuExtensions(triton::common::TritonJson::Value& params)
{
std::string cpu_ext_path;
- LOG_IF_ERROR(
- ReadParameter(params, "CPU_EXTENSION_PATH", &(cpu_ext_path)),
- "error when reading parameters");
+ RETURN_IF_ERROR(
+ ReadParameter(params, "CPU_EXTENSION_PATH", &(cpu_ext_path), ""));
if (!cpu_ext_path.empty()) {
// CPU (MKLDNN) extensions is loaded as a shared library and passed as a
// pointer to base extension
@@ -287,8 +287,7 @@ ModelState::LoadCpuExtensions(triton::common::TritonJson::Value& params)
ov_core_.add_extension(cpu_ext_path), " loading custom CPU extensions");
LOG_MESSAGE(
TRITONSERVER_LOG_INFO,
- (std::string("CPU (MKLDNN) extensions is loaded") + cpu_ext_path)
- .c_str());
+ (std::string("CPU extensions is loaded") + cpu_ext_path).c_str());
}
return nullptr;
@@ -301,8 +300,7 @@ ModelState::ParseBoolParameter(
bool* setting)
{
std::string value;
- LOG_IF_ERROR(
- ReadParameter(params, mkey, &(value)), "error when reading parameters");
+ RETURN_IF_ERROR(ReadParameter(params, mkey, &(value), ""));
std::transform(
value.begin(), value.end(), value.begin(),
[](unsigned char c) { return std::tolower(c); });
@@ -313,14 +311,30 @@ ModelState::ParseBoolParameter(
return nullptr;
}
+TRITONSERVER_Error*
+ModelState::ParseStringParameter(
+ const std::string& mkey, triton::common::TritonJson::Value& params,
+ std::string* setting)
+{
+ std::string value;
+ RETURN_IF_ERROR(ReadParameter(params, mkey, &(value), ""));
+ std::transform(
+ value.begin(), value.end(), value.begin(),
+ [](unsigned char c) { return std::toupper(c); });
+ if (value.length() > 0) {
+ *setting = value;
+ }
+
+ return nullptr;
+}
+
TRITONSERVER_Error*
ModelState::ParseParameter(
const std::string& mkey, triton::common::TritonJson::Value& params,
    std::vector<std::pair<std::string, ov::Any>>* device_config)
{
std::string value;
- LOG_IF_ERROR(
- ReadParameter(params, mkey, &(value)), "error when reading parameters");
+ RETURN_IF_ERROR(ReadParameter(params, mkey, &(value), ""));
if (!value.empty()) {
    std::pair<std::string, ov::Any> ov_property;
RETURN_IF_ERROR(ParseParameterHelper(mkey, &value, &ov_property));
@@ -410,6 +424,16 @@ ModelState::ParseParameterHelper(
TRITONSERVER_Error*
ModelState::ConfigureOpenvinoCore()
{
+ auto availableDevices = ov_core_.get_available_devices();
+ std::stringstream list_of_devices;
+
+ for (auto& element : availableDevices) {
+ list_of_devices << element << ",";
+ }
+ LOG_MESSAGE(
+ TRITONSERVER_LOG_VERBOSE,
+ (std::string("Available OpenVINO devices: " + list_of_devices.str()))
+ .c_str());
for (auto&& item : config_) {
std::string device_name = item.first;
    std::vector<std::pair<std::string, ov::Any>> properties = item.second;
@@ -438,9 +462,10 @@ ModelState::LoadModel(
std::to_string(OPENVINO_VERSION_MINOR) + "." +
std::to_string(OPENVINO_VERSION_PATCH))
.c_str());
+
LOG_MESSAGE(
TRITONSERVER_LOG_VERBOSE,
- (std::string("Device info: \n") +
+ (std::string("Device info: ") +
ConvertVersionMapToString(ov_core_.get_versions(device)))
.c_str());
@@ -932,19 +957,26 @@ ModelInstanceState::Create(
ModelInstanceState::ModelInstanceState(
ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance)
: BackendModelInstance(model_state, triton_model_instance),
- model_state_(model_state), device_("CPU"), batch_pad_size_(0)
+ model_state_(model_state), device_(model_state->TargetDevice()),
+ batch_pad_size_(0)
{
if (Kind() != TRITONSERVER_INSTANCEGROUPKIND_CPU) {
throw triton::backend::BackendModelInstanceException(TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(std::string("unable to load model '") + model_state_->Name() +
- "', Triton openVINO backend supports only CPU device")
+ "', Triton OpenVINO backend supports only Kind CPU and AUTO")
.c_str()));
}
if (model_state_->ModelNotRead()) {
std::string model_path;
THROW_IF_BACKEND_INSTANCE_ERROR(model_state_->ParseParameters());
+ device_ = model_state->TargetDevice();
+ LOG_MESSAGE(
+ TRITONSERVER_LOG_INFO,
+ (std::string("Target device " + device_)).c_str());
+
+
THROW_IF_BACKEND_INSTANCE_ERROR(
model_state_->ReadModel(ArtifactFilename(), &model_path));
THROW_IF_BACKEND_INSTANCE_ERROR(model_state_->ValidateConfigureModel());
@@ -1521,8 +1553,7 @@ TRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance)
LOG_MESSAGE(
TRITONSERVER_LOG_INFO,
(std::string("TRITONBACKEND_ModelInstanceInitialize: ") + name + " (" +
- TRITONSERVER_InstanceGroupKindString(kind) + " device " +
- std::to_string(device_id) + ")")
+ TRITONSERVER_InstanceGroupKindString(kind) + ")")
.c_str());
// Get the model state associated with this instance's model.
diff --git a/src/openvino_utils.cc b/src/openvino_utils.cc
index 72b8d6b..e6edefe 100644
--- a/src/openvino_utils.cc
+++ b/src/openvino_utils.cc
@@ -275,13 +275,14 @@ CompareDimsSupported(
TRITONSERVER_Error*
ReadParameter(
triton::common::TritonJson::Value& params, const std::string& key,
- std::string* param)
+ std::string* param, const std::string default_value)
{
triton::common::TritonJson::Value value;
- RETURN_ERROR_IF_FALSE(
- params.Find(key.c_str(), &value), TRITONSERVER_ERROR_INVALID_ARG,
- std::string("model configuration is missing the parameter ") + key);
- RETURN_IF_ERROR(value.MemberAsString("string_value", param));
+ if (params.Find(key.c_str(), &value)) {
+ RETURN_IF_ERROR(value.MemberAsString("string_value", param));
+ } else {
+ *param = default_value;
+ }
return nullptr; // success
}
diff --git a/src/openvino_utils.h b/src/openvino_utils.h
index 2fbaadb..a894937 100644
--- a/src/openvino_utils.h
+++ b/src/openvino_utils.h
@@ -97,7 +97,7 @@ TRITONSERVER_Error* CompareDimsSupported(
TRITONSERVER_Error* ReadParameter(
triton::common::TritonJson::Value& params, const std::string& key,
- std::string* param);
+ std::string* param, const std::string default_value);
std::vector<int64_t> ConvertToSignedShape(const ov::PartialShape& shape);
diff --git a/tests/README.md b/tests/README.md
index 5a7e6f9..a1ec7b0 100644
--- a/tests/README.md
+++ b/tests/README.md
@@ -3,16 +3,17 @@
Installing and running tests
```bash
pip install -r requirements.txt
-pytest
+pytest -sv --image=tritonserver:latest
```
Running tests with gpu
```bash
-pytest --gpu
+pytest -sv --gpu --image=tritonserver:latest
```
Run tests while caching downloaded models
```bash
-pytest --model-cache ./cache
+mkdir cache
+pytest -sv --model-cache ./cache --image=tritonserver:latest
```
diff --git a/tools/gen_openvino_dockerfile.py b/tools/gen_openvino_dockerfile.py
index bff8f47..0d31784 100755
--- a/tools/gen_openvino_dockerfile.py
+++ b/tools/gen_openvino_dockerfile.py
@@ -124,8 +124,8 @@ def dockerfile_for_linux(output_file):
RUN mkdir -p include && \
cp -r /workspace/install/runtime/include/* include/.
RUN mkdir -p lib && \
- cp -P /workspace/install/runtime/lib/intel64/*.so* lib/. && \
- cp -P /workspace/install/runtime/3rdparty/tbb/lib/libtbb.so* lib/. \
+ cp -P /workspace/install/runtime/3rdparty/tbb/lib/libtbb.so* lib/. && \
+ cp -P /workspace/install/runtime/lib/intel64/libopenvino*.so* lib/.
"""
df += """