
Commit 7ed8e0f

Merge branch 'main' of github.com:triton-inference-server/server into yinggeh-DLIS-6657-client-input-byte-size-check

2 parents 3863c39 + e181662

File tree

21 files changed: +577 additions, -152 deletions

Dockerfile.sdk

Lines changed: 38 additions & 10 deletions
@@ -32,8 +32,10 @@
 ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:24.07-py3-min

 ARG TRITON_CLIENT_REPO_SUBDIR=clientrepo
+ARG TRITON_PA_REPO_SUBDIR=perfanalyzerrepo
 ARG TRITON_COMMON_REPO_TAG=main
 ARG TRITON_CORE_REPO_TAG=main
+ARG TRITON_CLIENT_REPO_TAG=main
 ARG TRITON_THIRD_PARTY_REPO_TAG=main
 ARG TRITON_MODEL_ANALYZER_REPO_TAG=main
 ARG TRITON_ENABLE_GPU=ON

@@ -103,8 +105,10 @@ RUN rm -f /usr/bin/python && \
 # Build the client library and examples
 ARG TRITON_REPO_ORGANIZATION
 ARG TRITON_CLIENT_REPO_SUBDIR
+ARG TRITON_PA_REPO_SUBDIR
 ARG TRITON_COMMON_REPO_TAG
 ARG TRITON_CORE_REPO_TAG
+ARG TRITON_CLIENT_REPO_TAG
 ARG TRITON_THIRD_PARTY_REPO_TAG
 ARG TRITON_ENABLE_GPU
 ARG JAVA_BINDINGS_MAVEN_VERSION

@@ -114,26 +118,53 @@ ARG TARGETPLATFORM
 WORKDIR /workspace
 COPY TRITON_VERSION .
 COPY ${TRITON_CLIENT_REPO_SUBDIR} client
+COPY ${TRITON_PA_REPO_SUBDIR} perf_analyzer

-WORKDIR /workspace/build
+WORKDIR /workspace/client_build
 RUN cmake -DCMAKE_INSTALL_PREFIX=/workspace/install \
       -DTRITON_VERSION=`cat /workspace/TRITON_VERSION` \
       -DTRITON_REPO_ORGANIZATION=${TRITON_REPO_ORGANIZATION} \
       -DTRITON_COMMON_REPO_TAG=${TRITON_COMMON_REPO_TAG} \
       -DTRITON_CORE_REPO_TAG=${TRITON_CORE_REPO_TAG} \
       -DTRITON_THIRD_PARTY_REPO_TAG=${TRITON_THIRD_PARTY_REPO_TAG} \
+      -DTRITON_ENABLE_PERF_ANALYZER=OFF \
       -DTRITON_ENABLE_CC_HTTP=ON -DTRITON_ENABLE_CC_GRPC=ON \
-      -DTRITON_ENABLE_PYTHON_HTTP=ON -DTRITON_ENABLE_PYTHON_GRPC=ON \
+      -DTRITON_ENABLE_PYTHON_HTTP=OFF -DTRITON_ENABLE_PYTHON_GRPC=OFF \
       -DTRITON_ENABLE_JAVA_HTTP=ON \
-      -DTRITON_ENABLE_PERF_ANALYZER=ON \
+      -DTRITON_ENABLE_EXAMPLES=ON -DTRITON_ENABLE_TESTS=ON \
+      -DTRITON_ENABLE_GPU=${TRITON_ENABLE_GPU} /workspace/client
+RUN make -j16 cc-clients java-clients && \
+    rm -fr ~/.m2
+
+# TODO: PA will rebuild the CC clients since it depends on it.
+# This should be optimized so that we do not have to build
+# the CC clients twice. Similarly, because the SDK expectation is
+# that PA is packaged with the python client, we hold off on building
+# the python client until now. Post-migration we should focus
+# effort on de-tangling these flows.
+WORKDIR /workspace/pa_build
+RUN cmake -DCMAKE_INSTALL_PREFIX=/workspace/install \
+      -DTRITON_VERSION=`cat /workspace/TRITON_VERSION` \
+      -DTRITON_REPO_ORGANIZATION=${TRITON_REPO_ORGANIZATION} \
+      -DTRITON_COMMON_REPO_TAG=${TRITON_COMMON_REPO_TAG} \
+      -DTRITON_CORE_REPO_TAG=${TRITON_CORE_REPO_TAG} \
+      -DTRITON_CLIENT_REPO_TAG=${TRITON_CLIENT_REPO_TAG} \
       -DTRITON_ENABLE_PERF_ANALYZER_C_API=ON \
       -DTRITON_ENABLE_PERF_ANALYZER_TFS=ON \
       -DTRITON_ENABLE_PERF_ANALYZER_TS=ON \
       -DTRITON_ENABLE_PERF_ANALYZER_OPENAI=ON \
-      -DTRITON_ENABLE_EXAMPLES=ON -DTRITON_ENABLE_TESTS=ON \
-      -DTRITON_ENABLE_GPU=${TRITON_ENABLE_GPU} /workspace/client
-RUN make -j16 cc-clients python-clients java-clients && \
-    rm -fr ~/.m2
+      -DTRITON_ENABLE_CC_HTTP=ON \
+      -DTRITON_ENABLE_CC_GRPC=ON \
+      -DTRITON_ENABLE_PYTHON_HTTP=ON \
+      -DTRITON_ENABLE_PYTHON_GRPC=ON \
+      -DTRITON_PACKAGE_PERF_ANALYZER=ON \
+      -DTRITON_ENABLE_GPU=${TRITON_ENABLE_GPU} \
+      /workspace/perf_analyzer
+RUN make -j16 perf-analyzer python-clients
+
+RUN pip3 install build \
+    && cd /workspace/perf_analyzer/genai-perf \
+    && python3 -m build --wheel --outdir /workspace/install/python

 # Install Java API Bindings
 RUN if [ "$TARGETPLATFORM" = "linux/amd64" ]; then \

@@ -144,9 +175,6 @@ RUN if [ "$TARGETPLATFORM" = "linux/amd64" ]; then \
         --jar-install-path /workspace/install/java-api-bindings; \
     fi

-RUN pip3 install build \
-    && cd /workspace/client/src/c++/perf_analyzer/genai-perf \
-    && python3 -m build --wheel --outdir /workspace/install/python
 ############################################################################
 ## Create sdk container
 ############################################################################
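Note: the SDK build now runs in two passes: the client libraries are built first with perf_analyzer disabled, then perf_analyzer is built against them and packaged together with the Python client. A minimal sketch of how the image might be built with the new arguments, assuming the client and perf_analyzer checkouts sit in clientrepo/ and perfanalyzerrepo/ under the build context (the invocation and layout are illustrative; the directory names mirror the ARG defaults above):

    # hypothetical build invocation; repo layout is an assumption
    docker build -f Dockerfile.sdk \
        --build-arg TRITON_CLIENT_REPO_SUBDIR=clientrepo \
        --build-arg TRITON_PA_REPO_SUBDIR=perfanalyzerrepo \
        --build-arg TRITON_CLIENT_REPO_TAG=main \
        -t tritonserver_sdk .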

qa/L0_backend_python/argument_validation/test.sh

Lines changed: 1 addition & 1 deletion
@@ -25,7 +25,7 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

-CLIENT_PY=../python_unittest.py
+CLIENT_PY=../test_infer_shm_leak.py
 CLIENT_LOG="./arg_validation_client.log"
 TEST_RESULT_FILE='test_results.txt'
 SERVER_ARGS="--model-repository=${MODELDIR}/argument_validation/models --backend-directory=${BACKEND_DIR} --log-verbose=1"

qa/L0_backend_python/bls/test.sh

Lines changed: 23 additions & 18 deletions
@@ -25,15 +25,15 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

-CLIENT_PY=../python_unittest.py
+CLIENT_PY=../test_infer_shm_leak.py
 CLIENT_LOG="./bls_client.log"
 TEST_RESULT_FILE='test_results.txt'
 source ../../common/util.sh

 TRITON_REPO_ORGANIZATION=${TRITON_REPO_ORGANIZATION:=http://github.com/triton-inference-server}

 RET=0
-rm -fr *.log ./models *.txt
+rm -fr *.log ./models *.txt *.xml

 # FIXME: [DLIS-5970] Until Windows supports GPU tensors, only test CPU
 if [[ ${TEST_WINDOWS} == 0 ]]; then

@@ -119,30 +119,35 @@ if [[ ${TEST_WINDOWS} == 0 ]]; then

 for MODEL_NAME in bls bls_memory bls_memory_async bls_async; do
     export MODEL_NAME=${MODEL_NAME}
-
-    python3 -m pytest --junitxml="${MODEL_NAME}.${TRIAL}.${CUDA_MEMORY_POOL_SIZE_MB}.report.xml" $CLIENT_PY >> $CLIENT_LOG 2>&1
-    if [ $? -ne 0 ]; then
+    # Run with pytest to capture the return code correctly
+    pytest --junitxml="${MODEL_NAME}.${TRIAL}.${CUDA_MEMORY_POOL_SIZE_MB}.report.xml" $CLIENT_PY >> $CLIENT_LOG 2>&1
+    EXIT_CODE=$?
+    if [ $EXIT_CODE -ne 0 ]; then
         echo -e "\n***\n*** ${MODEL_NAME} ${BLS_KIND} test FAILED. \n***"
+        RET=$EXIT_CODE
         cat $SERVER_LOG
         cat $CLIENT_LOG
-        RET=1
     fi
 done

-set -e
-
 kill_server

-# Check for bls 'test_timeout' to ensure timeout value is being correctly passed
-if [ `grep -c "Request timeout: 11000000000" $SERVER_LOG` == "0" ]; then
-    echo -e "\n***\n*** BLS timeout value not correctly passed to model: line ${LINENO}\n***"
-    cat $SERVER_LOG
-    RET=1
+set -e
+
+# Only check the timeout value if there is no error since the test
+# may fail before the test_timeout case gets run.
+if [ $RET -eq 0 ]; then
+    # Check for bls 'test_timeout' to ensure timeout value is being correctly passed
+    if [ `grep -c "Request timeout: 11000000000" $SERVER_LOG` == "0" ]; then
+        echo -e "\n***\n*** BLS timeout value not correctly passed to model: line ${LINENO}\n***"
+        cat $SERVER_LOG
+        RET=1
+    fi
 fi

-if [[ $CUDA_MEMORY_POOL_SIZE_MB -eq 128 ]]; then
+if [[ $CUDA_MEMORY_POOL_SIZE_MB -eq 256 ]]; then
     if [ `grep -c "Failed to allocate memory from CUDA memory pool" $SERVER_LOG` != "0" ]; then
-        echo -e "\n***\n*** Expected to use CUDA memory pool for all tests when CUDA_MEMOY_POOL_SIZE_MB is 128 MB for 'bls' $BLS_KIND test\n***"
+        echo -e "\n***\n*** Expected to use CUDA memory pool for all tests when CUDA_MEMORY_POOL_SIZE_MB is 256 MB for 'bls' $BLS_KIND test\n***"
         cat $SERVER_LOG
         RET=1
     fi

@@ -342,10 +347,10 @@ set -e

 kill_server

-if [ $RET -eq 1 ]; then
-    echo -e "\n***\n*** BLS test FAILED. \n***"
-else
+if [ $RET -eq 0 ]; then
     echo -e "\n***\n*** BLS test PASSED. \n***"
+else
+    echo -e "\n***\n*** BLS test FAILED. \n***"
 fi

 exit $RET

qa/L0_backend_python/custom_metrics/test.sh

Lines changed: 1 addition & 1 deletion
@@ -25,7 +25,7 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

-CLIENT_PY=../python_unittest.py
+CLIENT_PY=../test_infer_shm_leak.py
 CLIENT_LOG="./custom_metrics_client.log"
 TEST_RESULT_FILE='test_results.txt'
 source ../../common/util.sh

qa/L0_backend_python/python_test.py

Lines changed: 30 additions & 0 deletions
@@ -365,6 +365,36 @@ def test_bool(self):
         self.assertIsNotNone(output0)
         self.assertTrue(np.all(output0 == input_data))

+    def test_bf16(self):
+        model_name = "identity_bf16"
+        shape = [2, 2]
+        with self._shm_leak_detector.Probe() as shm_probe:
+            with httpclient.InferenceServerClient(
+                f"{_tritonserver_ipaddr}:8000"
+            ) as client:
+                # NOTE: Client will truncate FP32 to BF16 internally
+                # since numpy has no built-in BF16 representation.
+                np_input = np.ones(shape, dtype=np.float32)
+                inputs = [
+                    httpclient.InferInput(
+                        "INPUT0", np_input.shape, "BF16"
+                    ).set_data_from_numpy(np_input)
+                ]
+                result = client.infer(model_name, inputs)
+
+                # Assert that Triton correctly returned a BF16 tensor.
+                response = result.get_response()
+                triton_output = response["outputs"][0]
+                triton_dtype = triton_output["datatype"]
+                self.assertEqual(triton_dtype, "BF16")
+
+                np_output = result.as_numpy("OUTPUT0")
+                self.assertIsNotNone(np_output)
+                # BF16 tensors are held in FP32 when converted to numpy due to
+                # lack of native BF16 support in numpy, so verify that.
+                self.assertEqual(np_output.dtype, np.float32)
+                self.assertTrue(np.allclose(np_output, np_input))
+
     def test_infer_pytorch(self):
         # FIXME: This model requires torch. Because windows tests are not run in a docker
         # environment with torch installed, we need to think about how we want to install

qa/L0_backend_python/request_rescheduling/test.sh

Lines changed: 1 addition & 1 deletion
@@ -25,7 +25,7 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

-CLIENT_PY="../python_unittest.py"
+CLIENT_PY="../test_infer_shm_leak.py"
 CLIENT_LOG="./request_rescheduling_client.log"
 TEST_RESULT_FILE='test_results.txt'
 source ../../common/util.sh

qa/L0_backend_python/setup_python_enviroment.sh

Lines changed: 1 addition & 1 deletion
@@ -151,7 +151,7 @@ apt-get update && apt-get -y install \
                     libboost-dev
 rm -f /usr/bin/python3 && \
     ln -s "/usr/bin/python3.${PYTHON_ENV_VERSION}" /usr/bin/python3
-pip3 install --upgrade install requests numpy virtualenv protobuf
+pip3 install --upgrade requests numpy virtualenv protobuf
 find /opt/tritonserver/qa/pkgs/ -maxdepth 1 -type f -name \
     "tritonclient-*linux*.whl" | xargs printf -- '%s[all]' | \
     xargs pip3 install --upgrade

qa/L0_backend_python/test.sh

Lines changed: 29 additions & 3 deletions
@@ -95,6 +95,9 @@
 mkdir -p models/identity_fp32/1/
 cp ../python_models/identity_fp32/model.py ./models/identity_fp32/1/model.py
 cp ../python_models/identity_fp32/config.pbtxt ./models/identity_fp32/config.pbtxt
+mkdir -p models/identity_bf16/1/
+cp ../python_models/identity_bf16/model.py ./models/identity_bf16/1/model.py
+cp ../python_models/identity_bf16/config.pbtxt ./models/identity_bf16/config.pbtxt
 RET=0

 cp -r ./models/identity_fp32 ./models/identity_uint8

@@ -422,11 +425,20 @@ if [ "$TEST_JETSON" == "0" ]; then
     # between dependencies.
     setup_virtualenv

+    set +e
     (cd ${TEST} && bash -ex test.sh)
-    if [ $? -ne 0 ]; then
+    EXIT_CODE=$?
+    if [ $EXIT_CODE -ne 0 ]; then
       echo "Subtest ${TEST} FAILED"
-      RET=1
+      RET=$EXIT_CODE
+
+      # In bls test, it is allowed to fail with a strict memory leak of 480 bytes with exit code '123'.
+      # Propagate the exit code to make sure it's not overwritten by other tests.
+      if [[ ${TEST} == "bls" ]] && [[ $EXIT_CODE -ne 1 ]] ; then
+        BLS_RET=$RET
+      fi
     fi
+    set -e

     deactivate_virtualenv
   done

@@ -435,11 +447,13 @@ if [ "$TEST_JETSON" == "0" ]; then
   if [[ ${PYTHON_ENV_VERSION} = "10" ]] && [[ ${TEST_WINDOWS} == 0 ]]; then
     # In 'env' test we use miniconda for dependency management. No need to run
     # the test in a virtual environment.
+    set +e
     (cd env && bash -ex test.sh)
     if [ $? -ne 0 ]; then
       echo "Subtest env FAILED"
       RET=1
     fi
+    set -e
   fi
 fi

@@ -456,12 +470,14 @@ for TEST in ${SUBTESTS}; do
   # between dependencies.
   setup_virtualenv

+  set +e
   (cd ${TEST} && bash -ex test.sh)

   if [ $? -ne 0 ]; then
     echo "Subtest ${TEST} FAILED"
     RET=1
   fi
+  set -e

   deactivate_virtualenv
 done

@@ -472,4 +488,14 @@ else
   echo -e "\n***\n*** Test FAILED\n***"
 fi

-exit $RET
+# Exit with RET if it is 1, meaning that the test failed.
+# Otherwise, exit with BLS_RET if it is set, meaning that the known memory leak is captured.
+if [ $RET -eq 1 ]; then
+  exit $RET
+else
+  if [ -z "$BLS_RET" ]; then
+    exit $RET
+  else
+    exit $BLS_RET
+  fi
+fi
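The set +e / set -e bracketing added above is what lets the script capture a subtest's exit code instead of aborting on the first failure under errexit. A condensed sketch of the pattern (variable names as in the script; the special value 123 comes from test_infer_shm_leak.py below):

    set +e                    # do not abort on a failing subtest
    (cd ${TEST} && bash -ex test.sh)
    EXIT_CODE=$?              # 0 = pass, 1 = failure, 123 = known 480-byte shm leak (bls only)
    set -e
    if [ $EXIT_CODE -ne 0 ]; then
        RET=$EXIT_CODE        # keep the specific code rather than flattening it to 1
    fi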

qa/L0_backend_python/python_unittest.py renamed to qa/L0_backend_python/test_infer_shm_leak.py

Lines changed: 18 additions & 13 deletions
@@ -33,6 +33,7 @@
 import os
 import unittest

+import pytest
 import shm_util
 import tritonclient.grpc as grpcclient
 from tritonclient.utils import *

@@ -41,11 +42,13 @@
 # we overwrite the IP address with the TRITONSERVER_IPADDR envvar
 _tritonserver_ipaddr = os.environ.get("TRITONSERVER_IPADDR", "localhost")

+# The exit code 123 is used to indicate that the shm leak probe detected a 480
+# bytes leak in the bls sub-test. Any leak other than 480 bytes will cause the
+# test to fail with the default exit code 1.
+ALLOWED_FAILURE_EXIT_CODE = 123

-class PythonUnittest(unittest.TestCase):
-    def setUp(self):
-        self._shm_leak_detector = shm_util.ShmLeakDetector()

+class TestInferShmLeak:
     def _run_unittest(self, model_name):
         with grpcclient.InferenceServerClient(f"{_tritonserver_ipaddr}:8001") as client:
             # No input is required

@@ -54,15 +57,17 @@ def _run_unittest(self, model_name):

             # The model returns 1 if the tests were successfully passed.
             # Otherwise, it will return 0.
-            self.assertEqual(
-                output0, [1], f"python_unittest failed for model {model_name}"
-            )
-
-    def test_python_unittest(self):
-        model_name = os.environ["MODEL_NAME"]
-        with self._shm_leak_detector.Probe() as shm_probe:
-            self._run_unittest(model_name)
+            assert output0 == [1], f"python_unittest failed for model {model_name}"

+    def test_shm_leak(self):
+        self._shm_leak_detector = shm_util.ShmLeakDetector()
+        model_name = os.environ.get("MODEL_NAME", "default_model")

-if __name__ == "__main__":
-    unittest.main()
+        try:
+            with self._shm_leak_detector.Probe() as shm_probe:
+                self._run_unittest(model_name)
+        except AssertionError as e:
+            if "Known shared memory leak of 480 bytes detected" in str(e):
+                pytest.exit(str(e), returncode=ALLOWED_FAILURE_EXIT_CODE)
+            else:
+                raise e
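With the move from unittest to pytest, callers learn the outcome from the process exit code rather than a parsed test report. A hedged usage sketch (model name and report file are illustrative):

    # 0 = pass, 1 = failure, 123 = known 480-byte leak treated as allowed
    MODEL_NAME=bls pytest --junitxml=bls.report.xml ../test_infer_shm_leak.py
    if [ $? -eq 123 ]; then
        echo "known shared memory leak detected; allowed failure"
    fi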

qa/L0_dlpack_multi_gpu/test.sh

Lines changed: 3 additions & 3 deletions
@@ -27,7 +27,7 @@

 SERVER=/opt/tritonserver/bin/tritonserver
 SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=1"
-CLIENT_PY=./python_unittest.py
+CLIENT_PY=./test_infer_shm_leak.py
 CLIENT_LOG="./client.log"
 EXPECTED_NUM_TESTS="1"
 TEST_RESULT_FILE='test_results.txt'

@@ -52,8 +52,8 @@ rm -fr *.log ./models
 mkdir -p models/dlpack_test/1/
 cp ../python_models/dlpack_test/model.py models/dlpack_test/1/
 cp ../python_models/dlpack_test/config.pbtxt models/dlpack_test
-cp ../L0_backend_python/python_unittest.py .
-sed -i 's#sys.path.append("../../common")#sys.path.append("../common")#g' python_unittest.py
+cp ../L0_backend_python/test_infer_shm_leak.py .
+sed -i 's#sys.path.append("../../common")#sys.path.append("../common")#g' test_infer_shm_leak.py

 run_server
 if [ "$SERVER_PID" == "0" ]; then
