From 8417e8176cc8c8d99f4e3b4996882cdc3923705c Mon Sep 17 00:00:00 2001
From: Vladislav Sovrasov
Date: Sat, 19 Oct 2024 07:23:33 +0900
Subject: [PATCH 1/6] Switch to tritonclient in OVMS adapter

---
 .../python/model_api/adapters/ovms_adapter.py | 137 ++++++++----------
 model_api/python/model_api/models/model.py    |   5 +-
 model_api/python/pyproject.toml               |   2 +-
 3 files changed, 60 insertions(+), 84 deletions(-)

diff --git a/model_api/python/model_api/adapters/ovms_adapter.py b/model_api/python/model_api/adapters/ovms_adapter.py
index 509b2ba9..8ba5359f 100644
--- a/model_api/python/model_api/adapters/ovms_adapter.py
+++ b/model_api/python/model_api/adapters/ovms_adapter.py
@@ -19,7 +19,7 @@
 import numpy as np

 from .inference_adapter import InferenceAdapter, Metadata
-from .utils import Layout
+from .utils import Layout, get_rt_info_from_dict


 class OVMSAdapter(InferenceAdapter):
@@ -29,62 +29,65 @@ class OVMSAdapter(InferenceAdapter):
     def __init__(self, target_model: str):
         """Expected format: <address>:<port>/models/<model_name>[:<model_version>]"""
-        import ovmsclient
+        import tritonclient.http as httpclient

         service_url, self.model_name, self.model_version = _parse_model_arg(
             target_model
         )
-        self.client = ovmsclient.make_grpc_client(url=service_url)
-        _verify_model_available(self.client, self.model_name, self.model_version)
+        self.client = httpclient.InferenceServerClient(service_url)
+        if not self.client.is_model_ready(self.model_name, self.model_version):
+            raise RuntimeError(
+                f"Requested model: {self.model_name}, version: {self.model_version} is not accessible"
+            )

         self.metadata = self.client.get_model_metadata(
             model_name=self.model_name, model_version=self.model_version
         )
+        self.inputs = self.get_input_layers()

     def get_input_layers(self):
         return {
-            name: Metadata(
-                {name},
+            meta["name"]: Metadata(
+                {meta["name"]},
                 meta["shape"],
                 Layout.from_shape(meta["shape"]),
-                _tf2ov_precision.get(meta["dtype"], meta["dtype"]),
+                meta["datatype"],
             )
-            for name, meta in self.metadata["inputs"].items()
+            for meta in self.metadata["inputs"]
         }

     def get_output_layers(self):
         return {
-            name: Metadata(
-                {name},
+            meta["name"]: Metadata(
+                {meta["name"]},
                 shape=meta["shape"],
-                precision=_tf2ov_precision.get(meta["dtype"], meta["dtype"]),
+                precision=meta["datatype"],
             )
-            for name, meta in self.metadata["outputs"].items()
+            for meta in self.metadata["outputs"]
         }

     def infer_sync(self, dict_data):
-        inputs = _prepare_inputs(dict_data, self.metadata["inputs"])
-        raw_result = self.client.predict(
-            inputs, model_name=self.model_name, model_version=self.model_version
+        inputs = _prepare_inputs(dict_data, self.inputs)
+        raw_result = self.client.infer(
+            model_name=self.model_name, model_version=self.model_version, inputs=inputs
         )
-        # For models with single output ovmsclient returns ndarray with results,
-        # so the dict must be created to correctly implement interface.
-        if isinstance(raw_result, np.ndarray):
-            output_name = next(iter((self.metadata["outputs"].keys())))
-            return {output_name: raw_result}
-        return raw_result
+
+        inference_results = {}
+        for output in self.metadata["outputs"]:
+            inference_results[output["name"]] = raw_result.as_numpy(output["name"])
+
+        return inference_results

     def infer_async(self, dict_data, callback_data):
-        inputs = _prepare_inputs(dict_data, self.metadata["inputs"])
-        raw_result = self.client.predict(
-            inputs, model_name=self.model_name, model_version=self.model_version
+        inputs = _prepare_inputs(dict_data, self.inputs)
+        raw_result = self.client.infer(
+            model_name=self.model_name, model_version=self.model_version, inputs=inputs
        )
-        # For models with single output ovmsclient returns ndarray with results,
-        # so the dict must be created to correctly implement interface.
- if isinstance(raw_result, np.ndarray): - output_name = list(self.metadata["outputs"].keys())[0] - raw_result = {output_name: raw_result} - self.callback_fn(raw_result, (lambda x: x, callback_data)) + inference_results = {} + for output in self.metadata["outputs"]: + inference_results[output["name"]] = raw_result.as_numpy(output["name"]) + + self.callback_fn(inference_results, (lambda x: x, callback_data)) def set_callback(self, callback_fn): self.callback_fn = callback_fn @@ -120,32 +123,19 @@ def reshape_model(self, new_shape): raise NotImplementedError def get_rt_info(self, path): - raise NotImplementedError("OVMSAdapter does not support RT info getting") - - -_tf2ov_precision = { - "DT_INT64": "I64", - "DT_UINT64": "U64", - "DT_FLOAT": "FP32", - "DT_UINT32": "U32", - "DT_INT32": "I32", - "DT_HALF": "FP16", - "DT_INT16": "I16", - "DT_INT8": "I8", - "DT_UINT8": "U8", -} - - -_tf2np_precision = { - "DT_INT64": np.int64, - "DT_UINT64": np.uint64, - "DT_FLOAT": np.float32, - "DT_UINT32": np.uint32, - "DT_INT32": np.int32, - "DT_HALF": np.float16, - "DT_INT16": np.int16, - "DT_INT8": np.int8, - "DT_UINT8": np.uint8, + return get_rt_info_from_dict(self.metadata["rt_info"], path) + + +_triton2np_precision = { + "INT64": np.int64, + "UINT64": np.uint64, + "FLOAT": np.float32, + "UINT32": np.uint32, + "INT32": np.int32, + "HALF": np.float16, + "INT16": np.int16, + "INT8": np.int8, + "UINT8": np.uint8, } @@ -161,40 +151,29 @@ def _parse_model_arg(target_model: str): model_spec = model.split(":") if len(model_spec) == 1: # model version not specified - use latest - return service_url, model_spec[0], 0 + return service_url, model_spec[0], "" if len(model_spec) == 2: - return service_url, model_spec[0], int(model_spec[1]) + return service_url, model_spec[0], model_spec[1] raise ValueError("invalid target_model format") -def _verify_model_available(client, model_name, model_version): - import ovmsclient - - version = "latest" if model_version == 0 else model_version - try: - model_status = client.get_model_status(model_name, model_version) - except ovmsclient.ModelNotFoundError as e: - raise RuntimeError( - f"Requested model: {model_name}, version: {version} has not been found" - ) from e - target_version = max(model_status.keys()) - version_status = model_status[target_version] - if version_status["state"] != "AVAILABLE" or version_status["error_code"] != 0: - raise RuntimeError( - f"Requested model: {model_name}, version: {version} is not in available state" - ) - - def _prepare_inputs(dict_data, inputs_meta): - inputs = {} + import tritonclient.http as httpclient + + inputs = [] for input_name, input_data in dict_data.items(): if input_name not in inputs_meta.keys(): raise ValueError("Input data does not match model inputs") input_info = inputs_meta[input_name] - model_precision = _tf2np_precision[input_info["dtype"]] + model_precision = _triton2np_precision[input_info.precision] if isinstance(input_data, np.ndarray) and input_data.dtype != model_precision: input_data = input_data.astype(model_precision) elif isinstance(input_data, list): input_data = np.array(input_data, dtype=model_precision) - inputs[input_name] = input_data + + infer_input = httpclient.InferInput( + input_name, input_data.shape, input_info.precision + ) + infer_input.set_data_from_numpy(input_data) + inputs.append(infer_input) return inputs diff --git a/model_api/python/model_api/models/model.py b/model_api/python/model_api/models/model.py index 78f7ce97..938ee385 100644 --- a/model_api/python/model_api/models/model.py +++ 
b/model_api/python/model_api/models/model.py @@ -268,10 +268,7 @@ def _load_config(self, config): "Cannot get runtime attribute. Path to runtime attribute is incorrect." in str(error) ) - is_OVMSAdapter = ( - str(error) == "OVMSAdapter does not support RT info getting" - ) - if not missing_rt_info and not is_OVMSAdapter: + if not missing_rt_info: raise for name, value in config.items(): diff --git a/model_api/python/pyproject.toml b/model_api/python/pyproject.toml index 341447b5..510246b2 100644 --- a/model_api/python/pyproject.toml +++ b/model_api/python/pyproject.toml @@ -33,7 +33,7 @@ dependencies = [ [project.optional-dependencies] ovms = [ - "ovmsclient", + "tritonclient[http]", ] tests = [ "httpx", From c2808636d03828b9209bcc1653e01240dbf8d318 Mon Sep 17 00:00:00 2001 From: Vladislav Sovrasov Date: Thu, 21 Nov 2024 10:00:50 +0900 Subject: [PATCH 2/6] Fix linter --- .../python/model_api/adapters/ovms_adapter.py | 20 ++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/model_api/python/model_api/adapters/ovms_adapter.py b/model_api/python/model_api/adapters/ovms_adapter.py index 81a67909..6de3eb6a 100644 --- a/model_api/python/model_api/adapters/ovms_adapter.py +++ b/model_api/python/model_api/adapters/ovms_adapter.py @@ -24,9 +24,8 @@ def __init__(self, target_model: str): ) self.client = httpclient.InferenceServerClient(service_url) if not self.client.is_model_ready(self.model_name, self.model_version): - raise RuntimeError( - f"Requested model: {self.model_name}, version: {self.model_version} is not accessible" - ) + msg = f"Requested model: {self.model_name}, version: {self.model_version} is not accessible" + raise RuntimeError(msg) self.metadata = self.client.get_model_metadata( model_name=self.model_name, @@ -58,7 +57,9 @@ def get_output_layers(self): def infer_sync(self, dict_data): inputs = _prepare_inputs(dict_data, self.inputs) raw_result = self.client.infer( - model_name=self.model_name, model_version=self.model_version, inputs=inputs + model_name=self.model_name, + model_version=self.model_version, + inputs=inputs, ) inference_results = {} @@ -70,7 +71,9 @@ def infer_sync(self, dict_data): def infer_async(self, dict_data, callback_data): inputs = _prepare_inputs(dict_data, self.inputs) raw_result = self.client.infer( - model_name=self.model_name, model_version=self.model_version, inputs=inputs + model_name=self.model_name, + model_version=self.model_version, + inputs=inputs, ) inference_results = {} for output in self.metadata["outputs"]: @@ -161,7 +164,8 @@ def _parse_model_arg(target_model: str): return service_url, model_spec[0], "" if len(model_spec) == 2: return service_url, model_spec[0], model_spec[1] - raise ValueError("invalid target_model format") + msg = "Invalid target_model format" + raise ValueError(msg) def _prepare_inputs(dict_data, inputs_meta): @@ -180,7 +184,9 @@ def _prepare_inputs(dict_data, inputs_meta): input_data = np.array(input_data, dtype=model_precision) infer_input = httpclient.InferInput( - input_name, input_data.shape, input_info.precision + input_name, + input_data.shape, + input_info.precision, ) infer_input.set_data_from_numpy(input_data) inputs.append(infer_input) From 2916a7c8fd1e596c1ed9bed2444abec18e24e33e Mon Sep 17 00:00:00 2001 From: Vladisalv Sovrasov Date: Sat, 14 Dec 2024 01:45:50 +0900 Subject: [PATCH 3/6] Update ovms docs --- model_api/python/model_api/adapters/ovms_adapter.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/model_api/python/model_api/adapters/ovms_adapter.md 
b/model_api/python/model_api/adapters/ovms_adapter.md
index a0bf4812..f71c4323 100644
--- a/model_api/python/model_api/adapters/ovms_adapter.md
+++ b/model_api/python/model_api/adapters/ovms_adapter.md
@@ -7,7 +7,7 @@ The `OVMSAdapter` implements `InferenceAdapter` interface. The `OVMSAdapter` mak
 `OVMSAdapter` enables inference via gRPC calls to OpenVINO Model Server, so in order to use it you need two things:

 - OpenVINO Model Server that serves your model
-- [`ovmsclient`](https://pypi.org/project/ovmsclient/) package installed to enable communication with the model server: `python3 -m pip install ovmsclient`
+- [`tritonclient[http]`](https://pypi.org/project/tritonclient/) package installed to enable communication with the model server: `python3 -m pip install tritonclient[http]`

 ### Deploy OpenVINO Model Server

@@ -15,7 +15,7 @@ Model Server is distributed as a docker image and it's available in DockerHub, s

 ## Model configuration

-When using OpenVINO Model Server model cannot be directly accessed from the client application (like OMZ demos). Therefore any configuration must be done on model server side or before starting the server: see [Prepare a model for `InferenceAdapter`](../../../../../README.md#prepare-a-model-for-inferenceadapter).
+When using OpenVINO Model Server, the model cannot be directly accessed from the client application. Therefore, any configuration must be done on the model server side or before starting the server: see [Prepare a model for `InferenceAdapter`](../../../../../README.md#prepare-a-model-for-inferenceadapter).

 ### Input reshaping

@@ -51,8 +51,8 @@ To run the demo with model served in OpenVINO Model Server, you would have to pr

 Assuming that model server runs on the same machine as the demo, exposes gRPC service on port 9000 and serves model called `model1`, the value of `-m` parameter would be:

-- `localhost:9000/models/model1` - requesting latest model version
-- `localhost:9000/models/model1:2` - requesting model version number 2
+- `localhost:9000/v2/models/model1` - requesting latest model version
+- `localhost:9000/v2/models/model1:2` - requesting model version number 2

 ## See Also

From c8b0a4b9fd84e28c1f7784a24ccf1bd84ab2eb0a Mon Sep 17 00:00:00 2001
From: Vladislav Sovrasov
Date: Sat, 14 Dec 2024 02:50:21 +0900
Subject: [PATCH 4/6] Update OVMS address

---
 model_api/python/model_api/adapters/ovms_adapter.py | 5 +++--
 model_api/python/model_api/models/model.py          | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/model_api/python/model_api/adapters/ovms_adapter.py b/model_api/python/model_api/adapters/ovms_adapter.py
index dcefe5da..07d07509 100644
--- a/model_api/python/model_api/adapters/ovms_adapter.py
+++ b/model_api/python/model_api/adapters/ovms_adapter.py
@@ -145,6 +145,7 @@ def save_model(self, path: str, weights_path: str | None = None, version: str |
     "INT16": np.int16,
     "INT8": np.int8,
     "UINT8": np.uint8,
+    "FP32": np.float32,
 }


@@ -154,12 +155,12 @@ def _parse_model_arg(target_model: str):
     if not isinstance(target_model, str):
         msg = "target_model must be str"
         raise TypeError(msg)
     # Expected format: <address>:<port>/models/<model_name>[:<model_version>]
     if not re.fullmatch(
-        r"(\w+\.*\-*)*\w+:\d+\/models\/[a-zA-Z0-9._-]+(\:\d+)*",
+        r"(\w+\.*\-*)*\w+:\d+\/v2/models\/[a-zA-Z0-9._-]+(\:\d+)*",
         target_model,
     ):
         msg = "invalid --model option format"
         raise ValueError(msg)
-    service_url, _, model = target_model.split("/")
+    service_url, _, _, model = target_model.split("/")
     model_spec = model.split(":")
     if len(model_spec) == 1:
         # model version not specified - use latest
diff --git a/model_api/python/model_api/models/model.py b/model_api/python/model_api/models/model.py
index f445e597..88bb13a9 100644
--- a/model_api/python/model_api/models/model.py
+++ b/model_api/python/model_api/models/model.py
@@ -175,7 +175,7 @@ def create_model(
     if isinstance(model, InferenceAdapter):
         inference_adapter = model
     elif isinstance(model, str) and re.compile(
-        r"(\w+\.*\-*)*\w+:\d+\/models\/[a-zA-Z0-9._-]+(\:\d+)*",
+        r"(\w+\.*\-*)*\w+:\d+\/v2/models\/[a-zA-Z0-9._-]+(\:\d+)*",
     ).fullmatch(model):
         inference_adapter = OVMSAdapter(model)
     else:

From 04763a22db4fa11d12cef9b1400859480ba38ec7 Mon Sep 17 00:00:00 2001
From: Vladislav Sovrasov
Date: Tue, 17 Dec 2024 01:38:07 +0900
Subject: [PATCH 5/6] Update OVMS launch configs

---
 .github/workflows/test_precommit.yml  | 2 +-
 examples/python/serving_api/README.md | 2 +-
 examples/python/serving_api/run.py    | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/test_precommit.yml b/.github/workflows/test_precommit.yml
index 0a042a9b..b1509ba1 100644
--- a/.github/workflows/test_precommit.yml
+++ b/.github/workflows/test_precommit.yml
@@ -146,6 +146,6 @@ jobs:
           python -m pip install --upgrade pip
           python -m pip install model_api/python/[ovms,tests]
           python -c "from model_api.models import DetectionModel; DetectionModel.create_model('ssd_mobilenet_v1_fpn_coco').save('ovms_models/ssd_mobilenet_v1_fpn_coco/1/ssd_mobilenet_v1_fpn_coco.xml')"
-          docker run -d --rm -v $GITHUB_WORKSPACE/ovms_models/:/models -p 9000:9000 -p 8000:8000 openvino/model_server:latest --model_path /models/ssd_mobilenet_v1_fpn_coco/ --model_name ssd_mobilenet_v1_fpn_coco --port 9000 --rest_port 8000 --log_level DEBUG --target_device CPU
+          docker run -d --rm -v $GITHUB_WORKSPACE/ovms_models/:/models -p 8000:8000 openvino/model_server:latest --model_path /models/ssd_mobilenet_v1_fpn_coco/ --model_name ssd_mobilenet_v1_fpn_coco --rest_port 8000 --log_level DEBUG --target_device CPU
           python tests/cpp/precommit/prepare_data.py -d data -p tests/cpp/precommit/public_scope.json
           python examples/python/serving_api/run.py data/coco128/images/train2017/000000000009.jpg # detects 4 objects
diff --git a/examples/python/serving_api/README.md b/examples/python/serving_api/README.md
index 3167a6ce..3049b397 100644
--- a/examples/python/serving_api/README.md
+++ b/examples/python/serving_api/README.md
@@ -28,7 +28,7 @@ This example demonstrates how to use a Python API of OpenVINO Model API for a re
 - Run docker with OVMS server:

   ```bash
-  docker run -d -v /home/user/models:/models -p 9000:9000 openvino/model_server:latest --model_path /models/ssd_mobilenet_v1_fpn_coco --model_name ssd_mobilenet_v1_fpn_coco --port 9000 --shape auto --nireq 4 --target_device CPU --plugin_config "{\"CPU_THROUGHPUT_STREAMS\": \"CPU_THROUGHPUT_AUTO\"}"
+  docker run -d -v /home/user/models:/models -p 8000:8000 openvino/model_server:latest --model_path /models/ssd_mobilenet_v1_fpn_coco --model_name ssd_mobilenet_v1_fpn_coco --rest_port 8000 --nireq 4 --target_device CPU
   ```

 ## Run example

diff --git a/examples/python/serving_api/run.py b/examples/python/serving_api/run.py
index 9f1850f7..3c28b178 100755
--- a/examples/python/serving_api/run.py
+++ b/examples/python/serving_api/run.py
@@ -20,7 +20,7 @@ def main():

     # Create Object Detection model specifying the OVMS server URL
     model = DetectionModel.create_model(
-        "localhost:9000/models/ssd_mobilenet_v1_fpn_coco", model_type="ssd"
+        "localhost:8000/v2/models/ssd_mobilenet_v1_fpn_coco", model_type="ssd"
     )
     detections = model(image)
     print(f"Detection results: {detections}")

From ec9b5347ab31407d29d7bd2f0d7ba6ef26ad17c8 Mon Sep 17 00:00:00 2001
From: Vladislav Sovrasov
Date: Tue, 17 Dec 2024 18:28:27 +0900
Subject: [PATCH 6/6] Update docs

---
 .../python/model_api/adapters/ovms_adapter.py | 54 +++++++++++++++----
 1 file changed, 44 insertions(+), 10 deletions(-)

diff --git a/model_api/python/model_api/adapters/ovms_adapter.py b/model_api/python/model_api/adapters/ovms_adapter.py
index 07d07509..603552a0 100644
--- a/model_api/python/model_api/adapters/ovms_adapter.py
+++ b/model_api/python/model_api/adapters/ovms_adapter.py
@@ -15,10 +15,15 @@

 class OVMSAdapter(InferenceAdapter):
-    """Class that allows working with models served by the OpenVINO Model Server"""
+    """Inference adapter that allows working with models served by the OpenVINO Model Server"""

     def __init__(self, target_model: str):
-        """Expected format: <address>:<port>/models/<model_name>[:<model_version>]"""
:/models/[:]""" + """ + Initializes OVMS adapter. + + Args: + target_model (str): Model URL. Expected format:
+        """
         import tritonclient.http as httpclient

         service_url, self.model_name, self.model_version = _parse_model_arg(
             target_model
         )
@@ -35,7 +40,13 @@ def __init__(self, target_model: str):
         )
         self.inputs = self.get_input_layers()

-    def get_input_layers(self):
+    def get_input_layers(self) -> dict[str, Metadata]:
+        """
+        Retrieves information about the remote model's inputs.
+
+        Returns:
+            dict[str, Metadata]: metadata for each input.
+        """
         return {
             meta["name"]: Metadata(
                 {meta["name"]},
@@ -46,7 +57,13 @@ def get_input_layers(self):
             for meta in self.metadata["inputs"]
         }

-    def get_output_layers(self):
+    def get_output_layers(self) -> dict[str, Metadata]:
+        """
+        Retrieves information about the remote model's outputs.
+
+        Returns:
+            dict[str, Metadata]: metadata for each output.
+        """
         return {
             meta["name"]: Metadata(
                 {meta["name"]},
@@ -56,7 +73,16 @@ def get_output_layers(self):
             for meta in self.metadata["outputs"]
         }

-    def infer_sync(self, dict_data):
+    def infer_sync(self, dict_data: dict) -> dict:
+        """
+        Performs synchronous model inference. This is a blocking call.
+
+        Args:
+            dict_data (dict): data for each input layer.
+
+        Returns:
+            dict: model raw outputs.
+        """
         inputs = _prepare_inputs(dict_data, self.inputs)
         raw_result = self.client.infer(
             model_name=self.model_name,
@@ -70,7 +96,8 @@ def infer_sync(self, dict_data):

         return inference_results

-    def infer_async(self, dict_data, callback_data):
+    def infer_async(self, dict_data: dict, callback_data: Any):
+        """A stub method imitating async inference with a blocking call."""
         inputs = _prepare_inputs(dict_data, self.inputs)
         raw_result = self.client.infer(
             model_name=self.model_name,
             model_version=self.model_version,
@@ -120,17 +147,22 @@ def embed_preprocessing(
     ):
         pass

-    def reshape_model(self, new_shape):
-        raise NotImplementedError
+    def reshape_model(self, new_shape: dict):
+        """OVMS adapter cannot modify the remote model. This method always raises an exception."""
+        msg = "OVMSAdapter does not support model reshaping"
+        raise NotImplementedError(msg)

-    def get_rt_info(self, path):
+    def get_rt_info(self, path: list[str]) -> Any:
+        """Returns an attribute stored in the model info."""
         return get_rt_info_from_dict(self.metadata["rt_info"], path)

     def update_model_info(self, model_info: dict[str, Any]):
+        """OVMS adapter cannot update the source model info. This method always raises an exception."""
         msg = "OVMSAdapter does not support updating model info"
         raise NotImplementedError(msg)

     def save_model(self, path: str, weights_path: str | None = None, version: str | None = None):
+        """OVMS adapter cannot retrieve the source model. This method always raises an exception."""
         msg = "OVMSAdapter does not support saving a model"
         raise NotImplementedError(msg)

@@ -150,6 +182,7 @@ def save_model(self, path: str, weights_path: str | None = None, version: str |


 def _parse_model_arg(target_model: str):
+    """Parses an OVMS model URL."""
     if not isinstance(target_model, str):
         msg = "target_model must be str"
         raise TypeError(msg)
@@ -171,7 +204,8 @@ def _parse_model_arg(target_model: str):
     raise ValueError(msg)


-def _prepare_inputs(dict_data, inputs_meta):
+def _prepare_inputs(dict_data: dict, inputs_meta: dict[str, Metadata]):
+    """Converts raw model inputs into the OVMS-specific representation."""
     import tritonclient.http as httpclient

     inputs = []
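
---

For reference, a minimal sketch of the client flow after this series, assuming a server launched as in the updated test_precommit.yml above (model `ssd_mobilenet_v1_fpn_coco`, REST port 8000 on localhost); the image path is a placeholder, and the low-level calls mirror what `OVMSAdapter.__init__` now does internally:

```python
import cv2
import tritonclient.http as httpclient

from model_api.models import DetectionModel

# Readiness and metadata queries against the KServe v2 REST endpoint,
# mirroring the checks OVMSAdapter.__init__ performs.
client = httpclient.InferenceServerClient("localhost:8000")
if not client.is_model_ready("ssd_mobilenet_v1_fpn_coco", ""):  # "" selects the latest version
    raise RuntimeError("model is not ready")
metadata = client.get_model_metadata(model_name="ssd_mobilenet_v1_fpn_coco", model_version="")
print([meta["name"] for meta in metadata["inputs"]])  # input names reported by the server

# High-level usage through Model API with the new URL scheme:
# <address>:<port>/v2/models/<model_name>[:<model_version>]
model = DetectionModel.create_model(
    "localhost:8000/v2/models/ssd_mobilenet_v1_fpn_coco",
    model_type="ssd",
)
image = cv2.imread("input.jpg")  # placeholder image path
detections = model(image)
print(f"Detection results: {detections}")
```

An empty version string means "latest", matching `_parse_model_arg`, which now returns `""` instead of `0` when no version suffix is given.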