27 changes: 27 additions & 0 deletions qa/L0_backend_python/io/io_test.py
@@ -259,6 +259,33 @@ def test_requested_output_decoupled(self):
        self.assertTrue(np.allclose(gpu_output_data[1:], next_gpu_output_data))
        self.assertTrue(user_data._completed_requests.empty())

    # Assert that a prior crash is fixed when a client requests a subset of
    # outputs from a decoupled model.
    def test_requested_output_decoupled_prior_crash(self):
        model_name = "llm"
        prompt = "test"

        text_input_data = np.array([[prompt]]).astype(object)
        inputs = [grpcclient.InferInput("text_input", text_input_data.shape, "BYTES")]
        inputs[-1].set_data_from_numpy(text_input_data)

        requested_outputs = [grpcclient.InferRequestedOutput("text_output")]

        user_data = UserData()
        with grpcclient.InferenceServerClient(f"{_tritonserver_ipaddr}:8001") as client:
            client.start_stream(callback=partial(callback, user_data))
            client.async_stream_infer(
                model_name=model_name, inputs=inputs, outputs=requested_outputs
            )
            client.stop_stream()

        outputs = ""
        while not user_data._completed_requests.empty():
            result = user_data._completed_requests.get(block=False)
            if isinstance(result, InferenceServerException):
                raise result
            outputs += str(result.as_numpy("text_output")[0], encoding="utf-8")
        self.assertGreater(len(outputs), 0, "text_output is empty")


if __name__ == "__main__":
    unittest.main()
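Note: `UserData`, `callback`, `partial`, `InferenceServerException`, and `_tritonserver_ipaddr` are defined elsewhere in io_test.py and its shared helpers. For readers of this diff, here is a minimal sketch of the streaming helpers the new test assumes; the bodies follow the usual Triton client pattern and are an assumption, not the file's exact contents:

import queue

from tritonclient.utils import InferenceServerException


class UserData:
    def __init__(self):
        # Streamed results and errors accumulate here for the test to drain.
        self._completed_requests = queue.Queue()


def callback(user_data, result, error):
    # Each streamed response delivers either a result or an error, never both.
    if error:
        user_data._completed_requests.put(error)
    else:
        user_data._completed_requests.put(result)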
64 changes: 64 additions & 0 deletions qa/L0_backend_python/io/requested_output_model/config.pbtxt
@@ -0,0 +1,64 @@
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#
# This test case was added based on a prior crash. DO NOT MODIFY!
#

name: "llm"
backend: "python"
max_batch_size: 128

model_transaction_policy {
  decoupled: true
}

input [
  {
    name: "text_input"
    data_type: TYPE_STRING
    dims: [ 1 ]
  }
]
output [
  {
    name: "text_output"
    data_type: TYPE_STRING
    dims: [ -1 ]
  },
  {
    name: "sequence_index"
    data_type: TYPE_INT32
    dims: [ 1 ]
  }
]

instance_group [
  {
    count: 1
    kind: KIND_CPU
  }
]
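With `decoupled: true`, a single request may produce any number of responses, so the server rejects plain (non-streaming) inference calls against this model; that is why the test above uses start_stream/async_stream_infer. A quick client-side check of the policy (a sketch, assuming the server launched by test.sh below is reachable on localhost:8001):

import tritonclient.grpc as grpcclient

with grpcclient.InferenceServerClient("localhost:8001") as client:
    # get_model_config returns a ModelConfigResponse protobuf; .config is the
    # ModelConfig message mirroring this config.pbtxt.
    config = client.get_model_config("llm").config
    print(config.model_transaction_policy.decoupled)  # True for this model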
122 changes: 122 additions & 0 deletions qa/L0_backend_python/io/requested_output_model/model.py
@@ -0,0 +1,122 @@
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#
# This test case was added based on a prior crash. DO NOT MODIFY!
#

import json
import traceback

import numpy as np
import triton_python_backend_utils as pb_utils


def get_valid_param_value(param, default_value=""):
    # Treat empty values and unexpanded "${...}" placeholders as unset.
    value = param.get("string_value", "")
    return default_value if value.startswith("${") or value == "" else value


class TritonPythonModel:
    def initialize(self, args):
        # Parse the model config.
        model_config = json.loads(args["model_config"])

        self.output_config = pb_utils.get_output_config_by_name(
            model_config, "text_output"
        )

        self.output_dtype = pb_utils.triton_string_to_numpy(
            self.output_config["data_type"]
        )

        self.decoupled = pb_utils.using_decoupled_model_transaction_policy(model_config)

        self.logger = pb_utils.Logger

    def create_triton_tensors(self, index):
        # Build both declared outputs; the test requests only "text_output".
        x = "bla" + str(index)
        output = [x.encode("utf8")]
        np_output = np.array(output).astype(self.output_dtype)
        seq_idx = np.array([[0]]).astype(np.int32)

        t1 = pb_utils.Tensor("text_output", np_output)
        t2 = pb_utils.Tensor("sequence_index", seq_idx)
        tensors = [t1, t2]
        return tensors

    def create_triton_response(self, index):
        tensors = self.create_triton_tensors(index)
        return pb_utils.InferenceResponse(output_tensors=tensors)

    def execute(self, requests):
        responses = []
        for request in requests:
            if self.decoupled:
                response_sender = request.get_response_sender()
            try:
                # A single response (index 0) is enough for this repro.
                for index in range(0, 1):
                    triton_response = self.create_triton_response(index)
                    # output_tensors = triton_response.output_tensors()
                    if self.decoupled:
                        response_sender.send(triton_response)
                    else:
                        responses.append(triton_response)

                if self.decoupled:
                    response_sender.send(
                        flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL
                    )

            except Exception:
                # self.logger.log_info(f"error response")
                self.logger.log_error(traceback.format_exc())
                # On error, send a response carrying the error message.
                error_response = pb_utils.InferenceResponse(
                    output_tensors=[],
                    error=pb_utils.TritonError(traceback.format_exc()),
                )

                if self.decoupled:
                    response_sender.send(error_response)
                    response_sender.send(
                        flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL
                    )
                else:
                    responses.append(error_response)

        if self.decoupled:
            return None
        else:
            assert len(responses) == len(requests)
            return responses

    def finalize(self):
        """`finalize` is called only once when the model is being unloaded.
        Implementing `finalize` is optional; it allows the model to perform
        any necessary cleanup before exit.
        """
        print("Cleaning up...")
26 changes: 26 additions & 0 deletions qa/L0_backend_python/io/test.sh
@@ -176,6 +176,32 @@ done
kill $SERVER_PID
wait $SERVER_PID

# IOTest.test_requested_output_decoupled_prior_crash
rm -rf models && mkdir models
mkdir -p models/llm/1/
cp requested_output_model/config.pbtxt models/llm/
cp requested_output_model/model.py models/llm/1/

run_server
if [ "$SERVER_PID" == "0" ]; then
    echo -e "\n***\n*** Failed to start $SERVER\n***"
    cat $SERVER_LOG
    RET=1
fi

SUBTEST="test_requested_output_decoupled_prior_crash"
set +e
python3 -m pytest --junitxml=${SUBTEST}.report.xml ${UNITTEST_PY}::IOTest::${SUBTEST} > ${CLIENT_LOG}.${SUBTEST}
if [ $? -ne 0 ]; then
    echo -e "\n***\n*** IOTest.${SUBTEST} FAILED.\n***"
    cat ${CLIENT_LOG}.${SUBTEST}
    RET=1
fi
set -e

kill $SERVER_PID
wait $SERVER_PID

if [ $RET -eq 0 ]; then
    echo -e "\n***\n*** IO test PASSED.\n***"
else