Commit 8d5f411

Merge branch 'main' of github.com:triton-inference-server/server into yinggeh-DLIS-6657-client-input-byte-size-check

2 parents 39715f4 + 70a0eee, commit 8d5f411

File tree: 20 files changed (+732, -263 lines)

build.py
Lines changed: 12 additions & 17 deletions

@@ -76,7 +76,7 @@
         "2024.0.0",  # ORT OpenVINO
         "2024.0.0",  # Standalone OpenVINO
         "3.2.6",  # DCGM version
-        "0.4.3",  # vLLM version
+        "0.5.0.post1",  # vLLM version
     )
 }

@@ -1082,25 +1082,20 @@ def create_dockerfile_linux(
 """
     if "tensorrtllm" in backends:
         df += """
-
-RUN ldconfig
-# Remove contents that are not needed in runtime
-RUN ARCH="$(uname -i)" \\
-    && rm -fr ${TRT_ROOT}/bin ${TRT_ROOT}/targets/${ARCH}-linux-gnu/bin ${TRT_ROOT}/data \\
-    && rm -fr ${TRT_ROOT}/doc ${TRT_ROOT}/onnx_graphsurgeon ${TRT_ROOT}/python \\
-    && rm -fr ${TRT_ROOT}/samples ${TRT_ROOT}/targets/${ARCH}-linux-gnu/samples
-
 # Install required packages for TRT-LLM models
-RUN python3 -m pip install --upgrade pip \\
-    && pip3 install transformers
-
-# ldconfig for TRT-LLM
-RUN find /usr -name libtensorrt_llm.so -exec dirname {} \; > /etc/ld.so.conf.d/tensorrt-llm.conf
-RUN find /opt/tritonserver -name libtritonserver.so -exec dirname {} \; > /etc/ld.so.conf.d/triton-tensorrtllm-worker.conf
-
+# Remove contents that are not needed in runtime
 # Setuptools has breaking changes in version 70.0.0, so fix it to 69.5.1
 # The generated code in grpc_service_pb2_grpc.py depends on grpcio>=1.64.0, so fix it to 1.64.0
-RUN pip3 install setuptools==69.5.1 grpcio-tools==1.64.0
+RUN ldconfig && \
+    ARCH="$(uname -i)" && \
+    rm -fr ${TRT_ROOT}/bin ${TRT_ROOT}/targets/${ARCH}-linux-gnu/bin ${TRT_ROOT}/data && \
+    rm -fr ${TRT_ROOT}/doc ${TRT_ROOT}/onnx_graphsurgeon ${TRT_ROOT}/python && \
+    rm -fr ${TRT_ROOT}/samples ${TRT_ROOT}/targets/${ARCH}-linux-gnu/samples && \
+    python3 -m pip install --upgrade pip && \
+    pip3 install --no-cache-dir transformers && \
+    find /usr -name libtensorrt_llm.so -exec dirname {} \; > /etc/ld.so.conf.d/tensorrt-llm.conf && \
+    find /opt/tritonserver -name libtritonserver.so -exec dirname {} \; > /etc/ld.so.conf.d/triton-tensorrtllm-worker.conf && \
+    pip3 install --no-cache-dir setuptools==69.5.1 grpcio-tools==1.64.0

 ENV LD_LIBRARY_PATH=/usr/local/tensorrt/lib/:/opt/tritonserver/backends/tensorrtllm:$LD_LIBRARY_PATH
 """

docs/protocol/extension_generate.md
Lines changed: 9 additions & 3 deletions

@@ -1,5 +1,5 @@
 <!--
-# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions

@@ -87,10 +87,12 @@ return an error.

 $generate_request =
 {
+  "id" : $string, #optional
   "text_input" : $string,
   "parameters" : $parameters #optional
 }

+* "id": An identifier for this request. Optional, but if specified this identifier must be returned in the response.
 * "text_input" : The text input that the model should generate output from.
 * "parameters" : An optional object containing zero or more parameters for this
   generate request expressed as key/value pairs. See

@@ -121,14 +123,15 @@ specification to set the parameters.
 Below is an example to send generate request with additional model parameters `stream` and `temperature`.

 ```
-$ curl -X POST localhost:8000/v2/models/mymodel/generate -d '{"text_input": "client input", "parameters": {"stream": false, "temperature": 0}}'
+$ curl -X POST localhost:8000/v2/models/mymodel/generate -d '{"id": "42", "text_input": "client input", "parameters": {"stream": false, "temperature": 0}}'

 POST /v2/models/mymodel/generate HTTP/1.1
 Host: localhost:8000
 Content-Type: application/json
 Content-Length: <xx>
 {
-  "text_input": "client input",
+  "id" : "42",
+  "text_input" : "client input",
   "parameters" :
   {
     "stream": false,

@@ -145,11 +148,13 @@ the HTTP body.

 $generate_response =
 {
+  "id" : $string
   "model_name" : $string,
   "model_version" : $string,
   "text_output" : $string
 }

+* "id" : The "id" identifier given in the request, if any.
 * "model_name" : The name of the model used for inference.
 * "model_version" : The specific model version used for inference.
 * "text_output" : The output of the inference.

@@ -159,6 +164,7 @@ the HTTP body.
 ```
 200
 {
+  "id" : "42"
   "model_name" : "mymodel",
   "model_version" : "1",
   "text_output" : "model output"

docs/user_guide/metrics.md
Lines changed: 19 additions & 0 deletions

@@ -100,6 +100,25 @@ Count*. The count metrics are illustrated by the following examples:
 | |Execution Count |`nv_inference_exec_count` |Number of inference batch executions (see [Inference Request Metrics](#inference-request-metrics), does not include cached requests)|Per model|Per request|
 | |Pending Request Count |`nv_inference_pending_request_count` |Number of inference requests awaiting execution by a backend. This number is incremented when a request is enqueued to the server (`TRITONSERVER_ServerInferAsync`) and is decremented when a backend is about to start executing the request. More details can be found below. |Per model|Per request|

+#### Failure Count Categories
+
+| Failed Request Reason |Description |
+|------------|------------|
+| REJECTED | Number of inference failures due to request timeout in the scheduler. |
+| CANCELED | Number of inference failures due to request cancellation in the core. |
+| BACKEND | Number of inference failures during execution of requests in the backend/model. |
+| OTHER | Number of inference failures due to other uncategorized reasons in the core. |
+
+> **Note**
+>
+> Ensemble failure metrics will reflect the failure counts of their composing models as well as the parent model, but currently do not capture the same granularity for the "reason" label and will default to the "OTHER" reason.
+>
+> For example, if EnsembleA contains ModelA, and ModelA experiences a failed request due to a queue/backlog timeout in the scheduler, ModelA will have a failed request metric reflecting `reason=REJECTED` and `count=1`.
+> Additionally, EnsembleA will have a failed request metric reflecting `reason=OTHER` and `count=2`.
+> The `count=2` reflects 1 from the internally failed request captured by ModelA, as well as 1 from the failed top-level request sent to EnsembleA by the user/client.
+> The `reason=OTHER` reflects the fact that the ensemble doesn't currently capture the specific reason why
+> ModelA's request failed at this time.
+
 #### Pending Request Count (Queue Size) Per-Model

 The *Pending Request Count* reflects the number of requests that have been
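
To read the new per-reason failure counters programmatically, the sketch below (not part of the diff) scrapes the metrics endpoint and extracts the `nv_inference_request_failure` series. It assumes the default metrics port 8002 and a placeholder model name; the regex mirrors the helper added to qa/L0_backend_python/lifecycle/lifecycle_test.py in this commit.

```python
# Minimal sketch, assuming the Triton metrics endpoint on localhost:8002 and a
# placeholder model name. A missing series is treated as zero recorded failures.
import re

import requests


def get_failure_count(model: str, reason: str, version: str = "1") -> int:
    metrics = requests.get("http://localhost:8002/metrics").text
    pattern = (
        rf'nv_inference_request_failure\{{model="{model}",reason="{reason}",'
        rf'version="{version}"\}} (\d+)'
    )
    match = re.search(pattern, metrics)
    return int(match.group(1)) if match else 0


for reason in ("REJECTED", "CANCELED", "BACKEND", "OTHER"):
    print(reason, get_failure_count("wrong_model", reason))
```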

docs/user_guide/trace.md
Lines changed: 16 additions & 0 deletions

@@ -623,6 +623,22 @@ Then, you can specify headers in the `infer` method. For references, please
 look at our [tests](https://github.com/triton-inference-server/server/blob/main/qa/L0_trace/opentelemetry_unittest.py),
 e.g. [http context propagation test](https://github.com/triton-inference-server/server/blob/main/qa/L0_trace/opentelemetry_unittest.py#L494-L508).

+### Custom Backend Tracing
+
+In the case when a custom activity needs to be traced in the backend, please
+use the `TRITONSERVER_InferenceTraceReportActivity` API. For examples, please
+refer to the [identity backend](https://github.com/triton-inference-server/identity_backend/blob/main/src/identity.cc).
+
+In `openTelemetry` trace mode, if one wishes to start a new span, make sure
+that the name of your custom activity ends with `_START`. To end the new span,
+make sure that the corresponding activity ends with `_END`. For example, in the
+identity backend, we start a `CUSTOM_ACTIVITY` span by [reporting](https://github.com/triton-inference-server/identity_backend/blob/oandreeva-custom-trace-activity/src/identity.cc#L872-L876)
+a `CUSTOM_ACTIVITY_START` event, and we close this span by [reporting](https://github.com/triton-inference-server/identity_backend/blob/oandreeva-custom-trace-activity/src/identity.cc#L880-L883)
+a `CUSTOM_ACTIVITY_END` event.
+
+Please note that it is the user's responsibility to make sure that all custom
+started spans are properly ended.
+
 ### Limitations

 - OpenTelemetry trace mode is not supported on Windows systems.

qa/L0_backend_python/env/test.sh
Lines changed: 2 additions & 0 deletions

@@ -253,6 +253,7 @@ run_server
 if [ "$SERVER_PID" == "0" ]; then
     echo -e "\n***\n*** Failed to start $SERVER\n***"
     cat $SERVER_LOG
+    aws s3 rb "${BUCKET_URL}" --force || true
     exit 1
 fi

@@ -286,6 +287,7 @@ run_server
 if [ "$SERVER_PID" == "0" ]; then
     echo -e "\n***\n*** Failed to start $SERVER\n***"
     cat $SERVER_LOG
+    aws s3 rb "${BUCKET_URL}" --force || true
     exit 1
 fi

qa/L0_backend_python/lifecycle/lifecycle_test.py
Lines changed: 34 additions & 1 deletion

@@ -27,8 +27,11 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 import os
+import re
 import sys

+import requests
+
 sys.path.append("../../common")

 import queue

@@ -63,6 +66,29 @@ class LifecycleTest(unittest.TestCase):
     def setUp(self):
         self._shm_leak_detector = shm_util.ShmLeakDetector()

+    def _get_metrics(self):
+        metrics_url = "http://localhost:8002/metrics"
+        r = requests.get(metrics_url)
+        r.raise_for_status()
+        return r.text
+
+    def _metrics_before_test(self, model, reason):
+        pattern = rf'nv_inference_request_failure\{{model="{model}",reason="{reason}",version="1"\}} (\d+)'
+        metrics = self._get_metrics()
+        match = re.search(pattern, metrics)
+        if match:
+            return int(match.group(1))
+        else:
+            raise Exception(f"Failure metrics for model='{model}' not found")
+
+    def _assert_metrics(
+        self, model_name, reason, expected_count_increase, initial_count
+    ):
+        metrics = self._get_metrics()
+        # Add initial count + expected count for the test
+        expected_metric = f'nv_inference_request_failure{{model="{model_name}",reason="{reason}",version="1"}} {expected_count_increase + initial_count}'
+        self.assertIn(expected_metric, metrics)
+
     def test_error_code(self):
         model_name = "error_code"
         shape = [1, 1]

@@ -181,7 +207,7 @@ def test_batch_error(self):
     def test_infer_pymodel_error(self):
         model_name = "wrong_model"
         shape = [2, 2]
-
+        initial_metrics_value = self._metrics_before_test(model_name, "BACKEND")
         with self._shm_leak_detector.Probe() as shm_probe:
             with httpclient.InferenceServerClient(
                 f"{_tritonserver_ipaddr}:8000"

@@ -207,6 +233,13 @@ def test_infer_pymodel_error(self):
             self.assertTrue(
                 False, "Wrong exception raised or did not raise an exception"
             )
+        expected_count_increase = 1
+        self._assert_metrics(
+            model_name,
+            "BACKEND",
+            expected_count_increase,
+            initial_metrics_value,
+        )


 if __name__ == "__main__":

qa/L0_grpc_state_cleanup/cleanup_test.py
Lines changed: 23 additions & 9 deletions

@@ -437,10 +437,10 @@ def test_simple_infer_error_status(self):

     def test_simple_infer_shutdownserver(self):
         # This test case is used to check whether all the state objects are
-        # released when the server is interrupted to shutdown in middle of
-        # inference run with final parameters being returned.
+        # released when the server is interrupted to shutdown in the beginning
+        # of inference run with final parameters being returned.
         with self.assertRaises(InferenceServerException) as cm:
-            self._simple_infer(request_count=10, kill_server=5)
+            self._simple_infer(request_count=20, kill_server=5)

     ###
     ### Streaming Tests

@@ -469,11 +469,18 @@ def test_streaming_timeout(self):
     def test_streaming_error_status(self):
         # This test case is used to check whether all the state objects are
         # released when RPC runs into error.
+        expected_exceptions = [
+            "This protocol is restricted, expecting header 'triton-grpc-protocol-infer-key'",
+            "The stream is no longer in valid state, the error detail is reported through provided callback. A new stream should be started after stopping the current stream.",
+        ]
         with self.assertRaises(InferenceServerException) as cm:
             self._streaming_infer(request_count=10, should_error=True)
-        self.assertIn(
-            "This protocol is restricted, expecting header 'triton-grpc-protocol-infer-key'",
-            str(cm.exception),
+
+        exception_match = False
+        for expected_exception in expected_exceptions:
+            exception_match |= expected_exception in str(cm.exception)
+        self.assertTrue(
+            exception_match, "Raised unexpected exception {}".format(str(cm.exception))
         )

     def test_streaming_infer_shutdownserver(self):

@@ -520,11 +527,18 @@ def test_decoupled_timeout(self):
     def test_decoupled_error_status(self):
         # This test case is used to check whether all the state objects are
         # released when RPC runs into error.
+        expected_exceptions = [
+            "This protocol is restricted, expecting header 'triton-grpc-protocol-infer-key'",
+            "The stream is no longer in valid state, the error detail is reported through provided callback. A new stream should be started after stopping the current stream.",
+        ]
         with self.assertRaises(InferenceServerException) as cm:
             self._decoupled_infer(request_count=10, repeat_count=10, should_error=True)
-        self.assertIn(
-            "This protocol is restricted, expecting header 'triton-grpc-protocol-infer-key'",
-            str(cm.exception),
+
+        exception_match = False
+        for expected_exception in expected_exceptions:
+            exception_match |= expected_exception in str(cm.exception)
+        self.assertTrue(
+            exception_match, "Raised unexpected exception {}".format(str(cm.exception))
         )

     def test_decoupled_infer_shutdownserver(self):

qa/L0_http/generate_endpoint_test.py
Lines changed: 43 additions & 0 deletions

@@ -142,6 +142,49 @@ def test_generate(self):
         self.assertIn("TEXT", data)
         self.assertEqual(text, data["TEXT"])

+    def test_request_id(self):
+        # Setup text based input
+        text = "hello world"
+        request_id = "42"
+
+        # Test when request id in request body
+        inputs = {"PROMPT": text, "id": request_id, "STREAM": False}
+        r = self.generate(self._model_name, inputs)
+        r.raise_for_status()
+
+        self.assertIn("Content-Type", r.headers)
+        self.assertEqual(r.headers["Content-Type"], "application/json")
+
+        data = r.json()
+        self.assertIn("id", data)
+        self.assertEqual(request_id, data["id"])
+        self.assertIn("TEXT", data)
+        self.assertEqual(text, data["TEXT"])
+
+        # Test when request id not in request body
+        inputs = {"PROMPT": text, "STREAM": False}
+        r = self.generate(self._model_name, inputs)
+        r.raise_for_status()
+
+        self.assertIn("Content-Type", r.headers)
+        self.assertEqual(r.headers["Content-Type"], "application/json")
+
+        data = r.json()
+        self.assertNotIn("id", data)
+
+        # Test when request id is empty
+        inputs = {"PROMPT": text, "id": "", "STREAM": False}
+        r = self.generate(self._model_name, inputs)
+        r.raise_for_status()
+
+        self.assertIn("Content-Type", r.headers)
+        self.assertEqual(r.headers["Content-Type"], "application/json")
+
+        data = r.json()
+        self.assertNotIn("id", data)
+        self.assertIn("TEXT", data)
+        self.assertEqual(text, data["TEXT"])
+
     def test_generate_stream(self):
         # Setup text-based input
         text = "hello world"

qa/L0_http/test.sh
Lines changed: 1 addition & 1 deletion

@@ -662,7 +662,7 @@ fi
 ## Python Unit Tests
 TEST_RESULT_FILE='test_results.txt'
 PYTHON_TEST=generate_endpoint_test.py
-EXPECTED_NUM_TESTS=15
+EXPECTED_NUM_TESTS=16
 set +e
 python $PYTHON_TEST >$CLIENT_LOG 2>&1
 if [ $? -ne 0 ]; then
