diff --git a/.github/workflows/test_precommit.yml b/.github/workflows/test_precommit.yml
index 0a042a9b..b1509ba1 100644
--- a/.github/workflows/test_precommit.yml
+++ b/.github/workflows/test_precommit.yml
@@ -146,6 +146,6 @@ jobs:
          python -m pip install --upgrade pip
          python -m pip install model_api/python/[ovms,tests]
          python -c "from model_api.models import DetectionModel; DetectionModel.create_model('ssd_mobilenet_v1_fpn_coco').save('ovms_models/ssd_mobilenet_v1_fpn_coco/1/ssd_mobilenet_v1_fpn_coco.xml')"
-          docker run -d --rm -v $GITHUB_WORKSPACE/ovms_models/:/models -p 9000:9000 -p 8000:8000 openvino/model_server:latest --model_path /models/ssd_mobilenet_v1_fpn_coco/ --model_name ssd_mobilenet_v1_fpn_coco --port 9000 --rest_port 8000 --log_level DEBUG --target_device CPU
+          docker run -d --rm -v $GITHUB_WORKSPACE/ovms_models/:/models -p 8000:8000 openvino/model_server:latest --model_path /models/ssd_mobilenet_v1_fpn_coco/ --model_name ssd_mobilenet_v1_fpn_coco --rest_port 8000 --log_level DEBUG --target_device CPU
          python tests/cpp/precommit/prepare_data.py -d data -p tests/cpp/precommit/public_scope.json
          python examples/python/serving_api/run.py data/coco128/images/train2017/000000000009.jpg # detects 4 objects
diff --git a/examples/python/serving_api/README.md b/examples/python/serving_api/README.md
index 3167a6ce..3049b397 100644
--- a/examples/python/serving_api/README.md
+++ b/examples/python/serving_api/README.md
@@ -28,7 +28,7 @@ This example demonstrates how to use a Python API of OpenVINO Model API for a re
 - Run docker with OVMS server:
 
   ```bash
-  docker run -d -v /home/user/models:/models -p 9000:9000 openvino/model_server:latest --model_path /models/ssd_mobilenet_v1_fpn_coco --model_name ssd_mobilenet_v1_fpn_coco --port 9000 --shape auto --nireq 4 --target_device CPU --plugin_config "{\"CPU_THROUGHPUT_STREAMS\": \"CPU_THROUGHPUT_AUTO\"}"
+  docker run -d -v /home/user/models:/models -p 8000:8000 openvino/model_server:latest --model_path /models/ssd_mobilenet_v1_fpn_coco --model_name ssd_mobilenet_v1_fpn_coco --rest_port 8000 --nireq 4 --target_device CPU
   ```
 
 ## Run example
diff --git a/examples/python/serving_api/run.py b/examples/python/serving_api/run.py
index 9f1850f7..3c28b178 100755
--- a/examples/python/serving_api/run.py
+++ b/examples/python/serving_api/run.py
@@ -20,7 +20,7 @@ def main():
 
     # Create Object Detection model specifying the OVMS server URL
     model = DetectionModel.create_model(
-        "localhost:9000/models/ssd_mobilenet_v1_fpn_coco", model_type="ssd"
+        "localhost:8000/v2/models/ssd_mobilenet_v1_fpn_coco", model_type="ssd"
     )
     detections = model(image)
     print(f"Detection results: {detections}")
diff --git a/model_api/python/model_api/adapters/ovms_adapter.md b/model_api/python/model_api/adapters/ovms_adapter.md
index a0bf4812..f71c4323 100644
--- a/model_api/python/model_api/adapters/ovms_adapter.md
+++ b/model_api/python/model_api/adapters/ovms_adapter.md
@@ -7,7 +7,7 @@ The `OVMSAdapter` implements `InferenceAdapter` interface. The `OVMSAdapter` mak
 `OVMSAdapter` enables inference via gRPC calls to OpenVINO Model Server, so in order to use it you need two things:
 
 - OpenVINO Model Server that serves your model
-- [`ovmsclient`](https://pypi.org/project/ovmsclient/) package installed to enable communication with the model server: `python3 -m pip install ovmsclient`
+- [`tritonclient[http]`](https://pypi.org/project/tritonclient/) package installed to enable communication with the model server: `python3 -m pip install tritonclient[http]`
 
 ### Deploy OpenVINO Model Server
 
@@ -15,7 +15,7 @@ Model Server is distributed as a docker image and it's available in DockerHub, s
 ## Model configuration
 
-When using OpenVINO Model Server model cannot be directly accessed from the client application (like OMZ demos). Therefore any configuration must be done on model server side or before starting the server: see [Prepare a model for `InferenceAdapter`](../../../../../README.md#prepare-a-model-for-inferenceadapter).
+When using OpenVINO Model Server model cannot be directly accessed from the client application. Therefore any configuration must be done on model server side or before starting the server: see [Prepare a model for `InferenceAdapter`](../../../../../README.md#prepare-a-model-for-inferenceadapter).
 
 ### Input reshaping
 
@@ -51,8 +51,8 @@ To run the demo with model served in OpenVINO Model Server, you would have to pr
 Assuming that model server runs on the same machine as the demo, exposes gRPC service on port 9000 and serves model called `model1`, the value of `-m` parameter would be:
 
-- `localhost:9000/models/model1` - requesting latest model version
-- `localhost:9000/models/model1:2` - requesting model version number 2
+- `localhost:9000/v2/models/model1` - requesting latest model version
+- `localhost:9000/v2/models/model1:2` - requesting model version number 2
 
 ## See Also
diff --git a/model_api/python/model_api/adapters/ovms_adapter.py b/model_api/python/model_api/adapters/ovms_adapter.py
index e86bba0c..603552a0 100644
--- a/model_api/python/model_api/adapters/ovms_adapter.py
+++ b/model_api/python/model_api/adapters/ovms_adapter.py
@@ -11,75 +11,104 @@ import numpy as np
 from .inference_adapter import InferenceAdapter, Metadata
-from .utils import Layout
+from .utils import Layout, get_rt_info_from_dict
 
 
 class OVMSAdapter(InferenceAdapter):
-    """Class that allows working with models served by the OpenVINO Model Server"""
+    """Inference adapter that allows working with models served by the OpenVINO Model Server"""
 
     def __init__(self, target_model: str):
-        """Expected format: <address>:<port>/models/<model_name>[:<model_version>]"""
-        import ovmsclient
+        """
+        Initializes OVMS adapter.
+
+        Args:
+            target_model (str): Model URL. Expected format: <address>:<port>/v2/models/<model_name>[:<model_version>]
+        """
+        import tritonclient.http as httpclient
 
         service_url, self.model_name, self.model_version = _parse_model_arg(
             target_model,
         )
-        self.client = ovmsclient.make_grpc_client(url=service_url)
-        _verify_model_available(self.client, self.model_name, self.model_version)
+        self.client = httpclient.InferenceServerClient(service_url)
+        if not self.client.is_model_ready(self.model_name, self.model_version):
+            msg = f"Requested model: {self.model_name}, version: {self.model_version} is not accessible"
+            raise RuntimeError(msg)
 
         self.metadata = self.client.get_model_metadata(
             model_name=self.model_name,
             model_version=self.model_version,
         )
+        self.inputs = self.get_input_layers()
+
+    def get_input_layers(self) -> dict[str, Metadata]:
+        """
+        Retrieves information about remote model's inputs.
 
-    def get_input_layers(self):
+        Returns:
+            dict[str, Metadata]: metadata for each input.
+        """
         return {
-            name: Metadata(
-                {name},
+            meta["name"]: Metadata(
+                {meta["name"]},
                 meta["shape"],
                 Layout.from_shape(meta["shape"]),
-                _tf2ov_precision.get(meta["dtype"], meta["dtype"]),
+                meta["datatype"],
             )
-            for name, meta in self.metadata["inputs"].items()
+            for meta in self.metadata["inputs"]
         }
 
-    def get_output_layers(self):
+    def get_output_layers(self) -> dict[str, Metadata]:
+        """
+        Retrieves information about remote model's outputs.
+
+        Returns:
+            dict[str, Metadata]: metadata for each output.
+        """
         return {
-            name: Metadata(
-                {name},
+            meta["name"]: Metadata(
+                {meta["name"]},
                 shape=meta["shape"],
-                precision=_tf2ov_precision.get(meta["dtype"], meta["dtype"]),
+                precision=meta["datatype"],
             )
-            for name, meta in self.metadata["outputs"].items()
+            for meta in self.metadata["outputs"]
         }
 
-    def infer_sync(self, dict_data):
-        inputs = _prepare_inputs(dict_data, self.metadata["inputs"])
-        raw_result = self.client.predict(
-            inputs,
+    def infer_sync(self, dict_data: dict) -> dict:
+        """
+        Performs the synchronous model inference. The infer is a blocking method.
+
+        Args:
+            dict_data (dict): data for each input layer.
+
+        Returns:
+            dict: model raw outputs.
+        """
+        inputs = _prepare_inputs(dict_data, self.inputs)
+        raw_result = self.client.infer(
             model_name=self.model_name,
             model_version=self.model_version,
+            inputs=inputs,
         )
-        # For models with single output ovmsclient returns ndarray with results,
-        # so the dict must be created to correctly implement interface.
-        if isinstance(raw_result, np.ndarray):
-            output_name = next(iter(self.metadata["outputs"].keys()))
-            return {output_name: raw_result}
-        return raw_result
-
-    def infer_async(self, dict_data, callback_data):
-        inputs = _prepare_inputs(dict_data, self.metadata["inputs"])
-        raw_result = self.client.predict(
-            inputs,
+
+        inference_results = {}
+        for output in self.metadata["outputs"]:
+            inference_results[output["name"]] = raw_result.as_numpy(output["name"])
+
+        return inference_results
+
+    def infer_async(self, dict_data: dict, callback_data: Any):
+        """A stub method imitating async inference with a blocking call."""
+        inputs = _prepare_inputs(dict_data, self.inputs)
+        raw_result = self.client.infer(
             model_name=self.model_name,
             model_version=self.model_version,
+            inputs=inputs,
         )
-        # For models with single output ovmsclient returns ndarray with results,
-        # so the dict must be created to correctly implement interface.
-        if isinstance(raw_result, np.ndarray):
-            output_name = next(iter(self.metadata["outputs"].keys()))
-            raw_result = {output_name: raw_result}
-        self.callback_fn(raw_result, (lambda x: x, callback_data))
+        inference_results = {}
+        for output in self.metadata["outputs"]:
+            inference_results[output["name"]] = raw_result.as_numpy(output["name"])
+
+        self.callback_fn(inference_results, (lambda x: x, callback_data))
 
     def set_callback(self, callback_fn: Callable):
         self.callback_fn = callback_fn
@@ -118,97 +147,84 @@ def embed_preprocessing(
     ):
         pass
 
-    def reshape_model(self, new_shape):
-        raise NotImplementedError
-
-    def get_rt_info(self, path):
-        msg = "OVMSAdapter does not support RT info getting"
+    def reshape_model(self, new_shape: dict):
+        """OVMS adapter can not modify the remote model. This method raises an exception."""
+        msg = "OVMSAdapter does not support model reshaping"
         raise NotImplementedError(msg)
 
+    def get_rt_info(self, path: list[str]) -> Any:
+        """Returns an attribute stored in model info."""
+        return get_rt_info_from_dict(self.metadata["rt_info"], path)
+
     def update_model_info(self, model_info: dict[str, Any]):
+        """OVMS adapter can not update the source model info. This method raises an exception."""
         msg = "OVMSAdapter does not support updating model info"
         raise NotImplementedError(msg)
 
     def save_model(self, path: str, weights_path: str | None = None, version: str | None = None):
+        """OVMS adapter can not retrieve the source model. This method raises an exception."""
         msg = "OVMSAdapter does not support saving a model"
         raise NotImplementedError(msg)
 
 
-_tf2ov_precision = {
-    "DT_INT64": "I64",
-    "DT_UINT64": "U64",
-    "DT_FLOAT": "FP32",
-    "DT_UINT32": "U32",
-    "DT_INT32": "I32",
-    "DT_HALF": "FP16",
-    "DT_INT16": "I16",
-    "DT_INT8": "I8",
-    "DT_UINT8": "U8",
-}
-
-
-_tf2np_precision = {
-    "DT_INT64": np.int64,
-    "DT_UINT64": np.uint64,
-    "DT_FLOAT": np.float32,
-    "DT_UINT32": np.uint32,
-    "DT_INT32": np.int32,
-    "DT_HALF": np.float16,
-    "DT_INT16": np.int16,
-    "DT_INT8": np.int8,
-    "DT_UINT8": np.uint8,
+_triton2np_precision = {
+    "INT64": np.int64,
+    "UINT64": np.uint64,
+    "FLOAT": np.float32,
+    "UINT32": np.uint32,
+    "INT32": np.int32,
+    "HALF": np.float16,
+    "INT16": np.int16,
+    "INT8": np.int8,
+    "UINT8": np.uint8,
+    "FP32": np.float32,
 }
 
 
 def _parse_model_arg(target_model: str):
+    """Parses OVMS model URL."""
     if not isinstance(target_model, str):
         msg = "target_model must be str"
         raise TypeError(msg)
     # Expected format: <address>:<port>/models/<model_name>[:<model_version>]
     if not re.fullmatch(
-        r"(\w+\.*\-*)*\w+:\d+\/models\/[a-zA-Z0-9._-]+(\:\d+)*",
+        r"(\w+\.*\-*)*\w+:\d+\/v2/models\/[a-zA-Z0-9._-]+(\:\d+)*",
        target_model,
    ):
        msg = "invalid --model option format"
        raise ValueError(msg)
-    service_url, _, model = target_model.split("/")
+    service_url, _, _, model = target_model.split("/")
 
     model_spec = model.split(":")
     if len(model_spec) == 1:
         # model version not specified - use latest
-        return service_url, model_spec[0], 0
+        return service_url, model_spec[0], ""
     if len(model_spec) == 2:
-        return service_url, model_spec[0], int(model_spec[1])
-    msg = "invalid target_model format"
+        return service_url, model_spec[0], model_spec[1]
+    msg = "Invalid target_model format"
     raise ValueError(msg)
 
 
-def _verify_model_available(client, model_name, model_version):
-    import ovmsclient
+def _prepare_inputs(dict_data: dict, inputs_meta: dict[str, Metadata]):
+    """Converts raw model inputs into OVMS-specific representation."""
+    import tritonclient.http as httpclient
 
-    version = "latest" if model_version == 0 else model_version
-    try:
-        model_status = client.get_model_status(model_name, model_version)
-    except ovmsclient.ModelNotFoundError as e:
-        msg = f"Requested model: {model_name}, version: {version} has not been found"
-        raise RuntimeError(msg) from e
-    target_version = max(model_status.keys())
-    version_status = model_status[target_version]
-    if version_status["state"] != "AVAILABLE" or version_status["error_code"] != 0:
-        msg = f"Requested model: {model_name}, version: {version} is not in available state"
-        raise RuntimeError(msg)
-
-
-def _prepare_inputs(dict_data, inputs_meta):
-    inputs = {}
+    inputs = []
     for input_name, input_data in dict_data.items():
         if input_name not in inputs_meta:
             msg = "Input data does not match model inputs"
             raise ValueError(msg)
         input_info = inputs_meta[input_name]
-        model_precision = _tf2np_precision[input_info["dtype"]]
+        model_precision = _triton2np_precision[input_info.precision]
         if isinstance(input_data, np.ndarray) and input_data.dtype != model_precision:
             input_data = input_data.astype(model_precision)
         elif isinstance(input_data, list):
             input_data = np.array(input_data, dtype=model_precision)
-        inputs[input_name] = input_data
+
+        infer_input = httpclient.InferInput(
+            input_name,
+            input_data.shape,
+            input_info.precision,
+        )
+        infer_input.set_data_from_numpy(input_data)
+        inputs.append(infer_input)
     return inputs
diff --git a/model_api/python/model_api/models/model.py b/model_api/python/model_api/models/model.py
index b301b98c..88bb13a9 100644
--- a/model_api/python/model_api/models/model.py
+++ b/model_api/python/model_api/models/model.py
@@ -175,7 +175,7 @@ def create_model(
         if isinstance(model, InferenceAdapter):
             inference_adapter = model
         elif isinstance(model, str) and re.compile(
-            r"(\w+\.*\-*)*\w+:\d+\/models\/[a-zA-Z0-9._-]+(\:\d+)*",
+            r"(\w+\.*\-*)*\w+:\d+\/v2/models\/[a-zA-Z0-9._-]+(\:\d+)*",
         ).fullmatch(model):
             inference_adapter = OVMSAdapter(model)
         else:
@@ -268,8 +268,7 @@ def _load_config(self, config: dict[str, Any]) -> None:
                 self.__setattr__(name, value)
             except RuntimeError as error:
                 missing_rt_info = "Cannot get runtime attribute. Path to runtime attribute is incorrect." in str(error)
-                is_OVMSAdapter = str(error) == "OVMSAdapter does not support RT info getting"
-                if not missing_rt_info and not is_OVMSAdapter:
+                if not missing_rt_info:
                     raise
 
         for name, value in config.items():
diff --git a/model_api/python/pyproject.toml b/model_api/python/pyproject.toml
index 88d71fae..11d8b2d0 100644
--- a/model_api/python/pyproject.toml
+++ b/model_api/python/pyproject.toml
@@ -34,7 +34,7 @@ dependencies = [
 
 [project.optional-dependencies]
 ovms = [
-    "ovmsclient",
+    "tritonclient[http]",
 ]
 tests = [
     "pre-commit",
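
Note: the snippet below is a minimal, hypothetical smoke test for the new REST-based serving path and is not part of the patch. It assumes the `openvino/model_server` container from the README is already serving `ssd_mobilenet_v1_fpn_coco` on `localhost:8000`, that `tritonclient[http]` and OpenCV are installed, and that the image path is only a placeholder.

```python
# Sketch only: verify the served model over the v2 REST API, then run Model API inference.
import cv2
import tritonclient.http as httpclient

from model_api.models import DetectionModel

# Direct check with the same client the updated OVMSAdapter uses internally.
client = httpclient.InferenceServerClient("localhost:8000")
if not client.is_model_ready("ssd_mobilenet_v1_fpn_coco"):
    raise RuntimeError("model is not ready on the server")
print(client.get_model_metadata("ssd_mobilenet_v1_fpn_coco"))  # inputs/outputs reported by OVMS

# URL format expected by the updated adapter: <address>:<port>/v2/models/<model_name>[:<model_version>]
model = DetectionModel.create_model(
    "localhost:8000/v2/models/ssd_mobilenet_v1_fpn_coco",
    model_type="ssd",
)
detections = model(cv2.imread("example.jpg"))  # placeholder image path
print(f"Detection results: {detections}")
```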