triton-inference-server · piotrm-nvidia · Jun 11, 2024 · Jun 11, 2024 · Jun 17, 2024 · Jun 17, 2024
diff --git a/src/python/library/tests/test_client.py b/src/python/library/tests/test_client.py
@@ -31,7 +31,7 @@
 
 class TestClient(unittest.TestCase):
+    # Smoke test: only checks that a Client object can be constructed.
     def test_client(self):
-        Client()
+        # Client.__init__ takes the server URL as its single argument.
+        Client("localhost:8000")
 
 
 if __name__ == "__main__":

diff --git a/src/python/library/tritonclient/_client.py b/src/python/library/tritonclient/_client.py
@@ -25,6 +25,9 @@
 # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+from typing import Union
+
+from tritonclient.hl import DecoupledModelClient, ModelClient
 from tritonclient.utils import raise_error
 
 
@@ -85,6 +88,32 @@ def unregister_plugin(self):
         self._plugin = None
 
 
-class Client(InferenceServerClientBase):
-    def __init__(self) -> None:
+# # Change url to 'http://localhost:8000' for utilizing HTTP client
+# client = Client(url='grpc://localhost:8001')
+#
+# input_tensor_as_numpy = np.array(...)
+#
+# # Infer should be async similar to the existing Python APIs
+# responses = client.model('simple').infer(inputs={'input': input_tensor_as_numpy})
+#
+# for response in responses:
+# 	numpy_array = np.asarray(response.outputs['output'])
+#
+# client.close()
+
+
+class Client:
+    """Hand out per-model clients that all target one server URL.
+
+    The URL passed to the constructor is stored and reused for every
+    model client created through :meth:`model`.
+    """
+
+    def __init__(self, url: str) -> None:
+        """Store the server URL.
+
+        Args:
+            url: Server address, forwarded unchanged as ``url`` to
+                ``ModelClient`` when :meth:`model` is called.
+        """
+        self._client_url = url
+        # NOTE(review): Client declares no base class here, so this call
+        # resolves to object.__init__ - confirm it is still wanted.
         super().__init__()
+
+    def model(self, name: str) -> Union[ModelClient, DecoupledModelClient]:
+        """Return a client bound to the model called ``name``.
+
+        A ``ModelClient`` is created first and its ``model_config`` is read.
+        If ``model_config.decoupled`` is truthy, a ``DecoupledModelClient``
+        is built from that client via ``from_existing_client`` and returned
+        instead; otherwise the ``ModelClient`` itself is returned.
+
+        Args:
+            name: Model name, forwarded as ``model_name`` to ``ModelClient``.
+
+        Returns:
+            The ``ModelClient``, or the ``DecoupledModelClient`` derived
+            from it.
+        """
+        client = ModelClient(url=self._client_url, model_name=name)
+        # NOTE(review): the model_config read below sits outside the
+        # try/finally; if it can raise, `client` is never closed - confirm.
+        if client.model_config.decoupled:
+            try:
+                decoupled_client = DecoupledModelClient.from_existing_client(client)
+            finally:
+                # The initial client is closed whether or not the conversion
+                # succeeded; only the decoupled client is handed to the caller.
+                client.close()
+            return decoupled_client
+        else:
+            return client
diff --git a/src/python/library/tritonclient/hl/README.md b/src/python/library/tritonclient/hl/README.md
@@ -0,0 +1,163 @@
+
+## Dependencies
+
+For testing purposes only, install the PyTriton package:
+
+```bash
+pip install nvidia-pytriton
+```
+
+## Non-decoupled PyTriton client
+
+The new client can be tested against a PyTriton server:
+
+```python
+import time
+import numpy as np
+from pytriton.model_config import ModelConfig, Tensor
+from pytriton.triton import Triton, TritonConfig
+from pytriton.decorators import batch
+
+@batch
+def identity(input):
+    return {"output": input}
+
+
+triton = Triton()
+triton.bind(
+    model_name="identity",
+    infer_func=identity,
+    inputs=[Tensor(name="input", dtype=np.bytes_, shape=(1,))],
+    outputs=[Tensor(name="output", dtype=np.bytes_, shape=(1,))],
+    strict=False,
+)
+triton.run()
+```
+
+
+You can test the new client with a simple request:
+
+<!--pytest-codeblocks:cont-->
+```python
+import numpy as np
+from tritonclient._client import Client
+
+client = Client("localhost:8000").model("identity")
+
+result = client.infer(inputs={"input": np.char.encode([["a"]], "utf-8")})
+```
+
+<!--pytest-codeblocks:cont-->
+<!--
+```python
+client.close()
+triton.stop()
+
+# Sleep for a while to let the server run
+import time
+time.sleep(40)
+
+assert "output" in result
+```
+-->
+
+
+Expected output:
+
+<!--pytest.mark.skip-->
+```python
+{'output': array(['a'], dtype='<U1')}
+```
+
+## Decoupled PyTriton client
+
+```python
+from pytriton.decorators import batch
+import time
+import numpy as np
+
+# Decorate your model function with `@batch`. This allows Triton to batch multiple requests together.
+@batch
+def _infer_fn(input):
+    for _ in range(3):
+        time.sleep(2.0)
+        yield {"output": input}
+
+# Create a Triton model configuration and bind it to the model function `_infer_fn`.
+from pytriton.model_config import ModelConfig, Tensor
+from pytriton.triton import Triton, TritonConfig
+```
+
+
+Bind Triton:
+
+<!--pytest-codeblocks:cont-->
+```python
+triton = Triton()
+triton.bind(
+    model_name="decoupled_identity",
+    infer_func=_infer_fn,
+    inputs=[
+        Tensor(name="input", dtype=np.int32, shape=(-1,)),
+        # Shape with a batch dimension (-1) to support variable-sized batches.
+    ],
+    outputs=[
+        Tensor(name="output", dtype=np.int32, shape=(-1,)),
+        # Output shape with a batch dimension (-1).
+    ],
+    config=ModelConfig(decoupled=True),
+)
+```
+
+Start Triton:
+
+<!--pytest-codeblocks:cont-->
+```python
+triton.run()
+```
+
+<!--pytest-codeblocks:cont-->
+<!--
+```python
+# Let the server run for a while
+import time
+time.sleep(40)
+```
+-->
+
+Use the client to iterate over the decoupled results:
+
+<!--pytest-codeblocks:cont-->
+```python
+import numpy as np
+from tritonclient._client import Client
+
+client = Client("localhost:8000").model("decoupled_identity")
+
+results = []
+
+# Test fails with 500 error
+
+for result in client.infer(inputs={"input": np.array([1], dtype=np.int32)}):
+    print(result)
+    results.append(result)
+```
+
+<!--pytest-codeblocks:cont-->
+<!--
+```python
+client.close()
+triton.stop()
+
+assert "output" in results[0]
+```
+-->
+
+Expected output:
+
+<!--pytest.mark.skip-->
+```python
+{'output': array([1], dtype=int32)}
+{'output': array([1], dtype=int32)}
+{'output': array([1], dtype=int32)}
+```
diff --git a/src/python/library/tritonclient/hl/__init__.py b/src/python/library/tritonclient/hl/__init__.py
@@ -0,0 +1,17 @@
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# noqa: D104
+
+from .client import DecoupledModelClient  # noqa: F401
+from .client import ModelClient  # noqa: F401