Commit 8d5f411

Merge branch 'main' of github.com:triton-inference-server/server into yinggeh-DLIS-6657-client-input-byte-size-check

2 parents 39715f4 + 70a0eee, commit 8d5f411

File tree: 20 files changed (+732, -263 lines)

build.py
Lines changed: 12 additions & 17 deletions

@@ -76,7 +76,7 @@
         "2024.0.0",  # ORT OpenVINO
         "2024.0.0",  # Standalone OpenVINO
         "3.2.6",  # DCGM version
-        "0.4.3",  # vLLM version
+        "0.5.0.post1",  # vLLM version
     )
 }

@@ -1082,25 +1082,20 @@ def create_dockerfile_linux(
 """
     if "tensorrtllm" in backends:
         df += """
-
-RUN ldconfig
-# Remove contents that are not needed in runtime
-RUN ARCH="$(uname -i)" \\
-    && rm -fr ${TRT_ROOT}/bin ${TRT_ROOT}/targets/${ARCH}-linux-gnu/bin ${TRT_ROOT}/data \\
-    && rm -fr ${TRT_ROOT}/doc ${TRT_ROOT}/onnx_graphsurgeon ${TRT_ROOT}/python \\
-    && rm -fr ${TRT_ROOT}/samples ${TRT_ROOT}/targets/${ARCH}-linux-gnu/samples
-
 # Install required packages for TRT-LLM models
-RUN python3 -m pip install --upgrade pip \\
-    && pip3 install transformers
-
-# ldconfig for TRT-LLM
-RUN find /usr -name libtensorrt_llm.so -exec dirname {} \; > /etc/ld.so.conf.d/tensorrt-llm.conf
-RUN find /opt/tritonserver -name libtritonserver.so -exec dirname {} \; > /etc/ld.so.conf.d/triton-tensorrtllm-worker.conf
-
+# Remove contents that are not needed in runtime
 # Setuptools has breaking changes in version 70.0.0, so fix it to 69.5.1
 # The generated code in grpc_service_pb2_grpc.py depends on grpcio>=1.64.0, so fix it to 1.64.0
-RUN pip3 install setuptools==69.5.1 grpcio-tools==1.64.0
+RUN ldconfig && \
+    ARCH="$(uname -i)" && \
+    rm -fr ${TRT_ROOT}/bin ${TRT_ROOT}/targets/${ARCH}-linux-gnu/bin ${TRT_ROOT}/data && \
+    rm -fr ${TRT_ROOT}/doc ${TRT_ROOT}/onnx_graphsurgeon ${TRT_ROOT}/python && \
+    rm -fr ${TRT_ROOT}/samples ${TRT_ROOT}/targets/${ARCH}-linux-gnu/samples && \
+    python3 -m pip install --upgrade pip && \
+    pip3 install --no-cache-dir transformers && \
+    find /usr -name libtensorrt_llm.so -exec dirname {} \; > /etc/ld.so.conf.d/tensorrt-llm.conf && \
+    find /opt/tritonserver -name libtritonserver.so -exec dirname {} \; > /etc/ld.so.conf.d/triton-tensorrtllm-worker.conf && \
+    pip3 install --no-cache-dir setuptools==69.5.1 grpcio-tools==1.64.0

 ENV LD_LIBRARY_PATH=/usr/local/tensorrt/lib/:/opt/tritonserver/backends/tensorrtllm:$LD_LIBRARY_PATH
 """

docs/protocol/extension_generate.md
Lines changed: 9 additions & 3 deletions

@@ -1,5 +1,5 @@
 <!--
-# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions

@@ -87,10 +87,12 @@ return an error.

 $generate_request =
 {
+  "id" : $string, #optional
   "text_input" : $string,
   "parameters" : $parameters #optional
 }

+* "id": An identifier for this request. Optional, but if specified this identifier must be returned in the response.
 * "text_input" : The text input that the model should generate output from.
 * "parameters" : An optional object containing zero or more parameters for this
   generate request expressed as key/value pairs. See

@@ -121,14 +123,15 @@ specification to set the parameters.
 Below is an example to send generate request with additional model parameters `stream` and `temperature`.

 ```
-$ curl -X POST localhost:8000/v2/models/mymodel/generate -d '{"text_input": "client input", "parameters": {"stream": false, "temperature": 0}}'
+$ curl -X POST localhost:8000/v2/models/mymodel/generate -d '{"id": "42", "text_input": "client input", "parameters": {"stream": false, "temperature": 0}}'

 POST /v2/models/mymodel/generate HTTP/1.1
 Host: localhost:8000
 Content-Type: application/json
 Content-Length: <xx>
 {
-  "text_input": "client input",
+  "id" : "42",
+  "text_input" : "client input",
   "parameters" :
   {
     "stream": false,

@@ -145,11 +148,13 @@ the HTTP body.

 $generate_response =
 {
+  "id" : $string
   "model_name" : $string,
   "model_version" : $string,
   "text_output" : $string
 }

+* "id" : The "id" identifier given in the request, if any.
 * "model_name" : The name of the model used for inference.
 * "model_version" : The specific model version used for inference.
 * "text_output" : The output of the inference.

@@ -159,6 +164,7 @@ the HTTP body.
 ```
 200
 {
+  "id" : "42"
   "model_name" : "mymodel",
   "model_version" : "1",
   "text_output" : "model output"

docs/user_guide/metrics.md
Lines changed: 19 additions & 0 deletions

@@ -100,6 +100,25 @@ Count*. The count metrics are illustrated by the following examples:
 | |Execution Count |`nv_inference_exec_count` |Number of inference batch executions (see [Inference Request Metrics](#inference-request-metrics), does not include cached requests)|Per model|Per request|
 | |Pending Request Count |`nv_inference_pending_request_count` |Number of inference requests awaiting execution by a backend. This number is incremented when a request is enqueued to the server (`TRITONSERVER_ServerInferAsync`) and is decremented when a backend is about to start executing the request. More details can be found below. |Per model|Per request|

+#### Failure Count Categories
+
+| Failed Request Reason |Description |
+|------------|------------|
+| REJECTED | Number of inference failures due to request timeout in the scheduler. |
+| CANCELED | Number of inference failures due to request cancellation in the core. |
+| BACKEND | Number of inference failures during execution of requests in the backend/model. |
+| OTHER | Number of inference failures due to other uncategorized reasons in the core. |
+
+> **Note**
+>
+> Ensemble failure metrics will reflect the failure counts of their composing models as well as the parent model, but currently do not capture the same granularity for the "reason" label and will default to the "OTHER" reason.
+>
+> For example, if EnsembleA contains ModelA, and ModelA experiences a failed request due to a queue/backlog timeout in the scheduler, ModelA will have a failed request metric reflecting `reason=REJECTED` and `count=1`.
+> Additionally, EnsembleA will have a failed request metric reflecting `reason=OTHER` and `count=2`.
+> The `count=2` reflects 1 from the internally failed request captured by ModelA, as well as 1 from the failed top-level request sent to EnsembleA by the user/client.
+> The `reason=OTHER` reflects the fact that the ensemble doesn't currently capture the specific reason why
+> ModelA's request failed at this time.
+
 #### Pending Request Count (Queue Size) Per-Model

 The *Pending Request Count* reflects the number of requests that have been
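
To read the new per-reason failure counters programmatically, the sketch below (not part of the diff) scrapes the metrics endpoint and extracts the `nv_inference_request_failure` series. It assumes the default metrics port 8002 and a placeholder model name; the regex mirrors the helper added to qa/L0_backend_python/lifecycle/lifecycle_test.py in this commit.

```python
# Minimal sketch, assuming the Triton metrics endpoint on localhost:8002 and a
# placeholder model name. A missing series is treated as zero recorded failures.
import re

import requests


def get_failure_count(model: str, reason: str, version: str = "1") -> int:
    metrics = requests.get("http://localhost:8002/metrics").text
    pattern = (
        rf'nv_inference_request_failure\{{model="{model}",reason="{reason}",'
        rf'version="{version}"\}} (\d+)'
    )
    match = re.search(pattern, metrics)
    return int(match.group(1)) if match else 0


for reason in ("REJECTED", "CANCELED", "BACKEND", "OTHER"):
    print(reason, get_failure_count("wrong_model", reason))
```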

docs/user_guide/trace.md
Lines changed: 16 additions & 0 deletions

@@ -623,6 +623,22 @@ Then, you can specify headers in the `infer` method. For references, please
 look at our [tests](https://github.com/triton-inference-server/server/blob/main/qa/L0_trace/opentelemetry_unittest.py),
 e.g. [http context propagation test](https://github.com/triton-inference-server/server/blob/main/qa/L0_trace/opentelemetry_unittest.py#L494-L508).

+### Custom Backend Tracing
+
+In the case when a custom activity needs to be traced in the backend, please
+use the `TRITONSERVER_InferenceTraceReportActivity` API. For examples, please
+refer to the [identity backend](https://github.com/triton-inference-server/identity_backend/blob/main/src/identity.cc).
+
+In `openTelemetry` trace mode, if one wishes to start a new span, make sure
+that the name of your custom activity ends with `_START`. To end the new span,
+make sure that the corresponding activity ends with `_END`. For example, in the
+identity backend, we start a `CUSTOM_ACTIVITY` span by [reporting](https://github.com/triton-inference-server/identity_backend/blob/oandreeva-custom-trace-activity/src/identity.cc#L872-L876)
+a `CUSTOM_ACTIVITY_START` event, and we close this span by [reporting](https://github.com/triton-inference-server/identity_backend/blob/oandreeva-custom-trace-activity/src/identity.cc#L880-L883)
+a `CUSTOM_ACTIVITY_END` event.
+
+Please note that it is the user's responsibility to make sure that all custom
+started spans are properly ended.
+
 ### Limitations

 - OpenTelemetry trace mode is not supported on Windows systems.

qa/L0_backend_python/env/test.sh
Lines changed: 2 additions & 0 deletions

@@ -253,6 +253,7 @@ run_server
 if [ "$SERVER_PID" == "0" ]; then
     echo -e "\n***\n*** Failed to start $SERVER\n***"
     cat $SERVER_LOG
+    aws s3 rb "${BUCKET_URL}" --force || true
     exit 1
 fi

@@ -286,6 +287,7 @@ run_server
 if [ "$SERVER_PID" == "0" ]; then
     echo -e "\n***\n*** Failed to start $SERVER\n***"
     cat $SERVER_LOG
+    aws s3 rb "${BUCKET_URL}" --force || true
     exit 1
 fi

qa/L0_backend_python/lifecycle/lifecycle_test.py
Lines changed: 34 additions & 1 deletion

@@ -27,8 +27,11 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 import os
+import re
 import sys

+import requests
+
 sys.path.append("../../common")

 import queue

@@ -63,6 +66,29 @@ class LifecycleTest(unittest.TestCase):
     def setUp(self):
         self._shm_leak_detector = shm_util.ShmLeakDetector()

+    def _get_metrics(self):
+        metrics_url = "http://localhost:8002/metrics"
+        r = requests.get(metrics_url)
+        r.raise_for_status()
+        return r.text
+
+    def _metrics_before_test(self, model, reason):
+        pattern = rf'nv_inference_request_failure\{{model="{model}",reason="{reason}",version="1"\}} (\d+)'
+        metrics = self._get_metrics()
+        match = re.search(pattern, metrics)
+        if match:
+            return int(match.group(1))
+        else:
+            raise Exception(f"Failure metrics for model='{model}' not found")
+
+    def _assert_metrics(
+        self, model_name, reason, expected_count_increase, initial_count
+    ):
+        metrics = self._get_metrics()
+        # Add initial count + expected count for the test
+        expected_metric = f'nv_inference_request_failure{{model="{model_name}",reason="{reason}",version="1"}} {expected_count_increase + initial_count}'
+        self.assertIn(expected_metric, metrics)
+
     def test_error_code(self):
         model_name = "error_code"
         shape = [1, 1]

@@ -181,7 +207,7 @@ def test_batch_error(self):
     def test_infer_pymodel_error(self):
         model_name = "wrong_model"
         shape = [2, 2]
-
+        initial_metrics_value = self._metrics_before_test(model_name, "BACKEND")
         with self._shm_leak_detector.Probe() as shm_probe:
             with httpclient.InferenceServerClient(
                 f"{_tritonserver_ipaddr}:8000"

@@ -207,6 +233,13 @@ def test_infer_pymodel_error(self):
             self.assertTrue(
                 False, "Wrong exception raised or did not raise an exception"
             )
+        expected_count_increase = 1
+        self._assert_metrics(
+            model_name,
+            "BACKEND",
+            expected_count_increase,
+            initial_metrics_value,
+        )


 if __name__ == "__main__":

qa/L0_grpc_state_cleanup/cleanup_test.py
Lines changed: 23 additions & 9 deletions

@@ -437,10 +437,10 @@ def test_simple_infer_error_status(self):

     def test_simple_infer_shutdownserver(self):
         # This test case is used to check whether all the state objects are
-        # released when the server is interrupted to shutdown in middle of
-        # inference run with final parameters being returned.
+        # released when the server is interrupted to shutdown in the beginning
+        # of inference run with final parameters being returned.
         with self.assertRaises(InferenceServerException) as cm:
-            self._simple_infer(request_count=10, kill_server=5)
+            self._simple_infer(request_count=20, kill_server=5)

     ###
     ### Streaming Tests

@@ -469,11 +469,18 @@ def test_streaming_timeout(self):
     def test_streaming_error_status(self):
         # This test case is used to check whether all the state objects are
         # released when RPC runs into error.
+        expected_exceptions = [
+            "This protocol is restricted, expecting header 'triton-grpc-protocol-infer-key'",
+            "The stream is no longer in valid state, the error detail is reported through provided callback. A new stream should be started after stopping the current stream.",
+        ]
         with self.assertRaises(InferenceServerException) as cm:
             self._streaming_infer(request_count=10, should_error=True)
-        self.assertIn(
-            "This protocol is restricted, expecting header 'triton-grpc-protocol-infer-key'",
-            str(cm.exception),
+
+        exception_match = False
+        for expected_exception in expected_exceptions:
+            exception_match |= expected_exception in str(cm.exception)
+        self.assertTrue(
+            exception_match, "Raised unexpected exception {}".format(str(cm.exception))
         )

     def test_streaming_infer_shutdownserver(self):

@@ -520,11 +527,18 @@ def test_decoupled_timeout(self):
     def test_decoupled_error_status(self):
         # This test case is used to check whether all the state objects are
         # released when RPC runs into error.
+        expected_exceptions = [
+            "This protocol is restricted, expecting header 'triton-grpc-protocol-infer-key'",
+            "The stream is no longer in valid state, the error detail is reported through provided callback. A new stream should be started after stopping the current stream.",
+        ]
         with self.assertRaises(InferenceServerException) as cm:
             self._decoupled_infer(request_count=10, repeat_count=10, should_error=True)
-        self.assertIn(
-            "This protocol is restricted, expecting header 'triton-grpc-protocol-infer-key'",
-            str(cm.exception),
+
+        exception_match = False
+        for expected_exception in expected_exceptions:
+            exception_match |= expected_exception in str(cm.exception)
+        self.assertTrue(
+            exception_match, "Raised unexpected exception {}".format(str(cm.exception))
         )

     def test_decoupled_infer_shutdownserver(self):

qa/L0_http/generate_endpoint_test.py
Lines changed: 43 additions & 0 deletions

@@ -142,6 +142,49 @@ def test_generate(self):
         self.assertIn("TEXT", data)
         self.assertEqual(text, data["TEXT"])

+    def test_request_id(self):
+        # Setup text based input
+        text = "hello world"
+        request_id = "42"
+
+        # Test when request id in request body
+        inputs = {"PROMPT": text, "id": request_id, "STREAM": False}
+        r = self.generate(self._model_name, inputs)
+        r.raise_for_status()
+
+        self.assertIn("Content-Type", r.headers)
+        self.assertEqual(r.headers["Content-Type"], "application/json")
+
+        data = r.json()
+        self.assertIn("id", data)
+        self.assertEqual(request_id, data["id"])
+        self.assertIn("TEXT", data)
+        self.assertEqual(text, data["TEXT"])
+
+        # Test when request id not in request body
+        inputs = {"PROMPT": text, "STREAM": False}
+        r = self.generate(self._model_name, inputs)
+        r.raise_for_status()
+
+        self.assertIn("Content-Type", r.headers)
+        self.assertEqual(r.headers["Content-Type"], "application/json")
+
+        data = r.json()
+        self.assertNotIn("id", data)
+
+        # Test when request id is empty
+        inputs = {"PROMPT": text, "id": "", "STREAM": False}
+        r = self.generate(self._model_name, inputs)
+        r.raise_for_status()
+
+        self.assertIn("Content-Type", r.headers)
+        self.assertEqual(r.headers["Content-Type"], "application/json")
+
+        data = r.json()
+        self.assertNotIn("id", data)
+        self.assertIn("TEXT", data)
+        self.assertEqual(text, data["TEXT"])
+
     def test_generate_stream(self):
         # Setup text-based input
         text = "hello world"

qa/L0_http/test.sh
Lines changed: 1 addition & 1 deletion

@@ -662,7 +662,7 @@ fi
 ## Python Unit Tests
 TEST_RESULT_FILE='test_results.txt'
 PYTHON_TEST=generate_endpoint_test.py
-EXPECTED_NUM_TESTS=15
+EXPECTED_NUM_TESTS=16
 set +e
 python $PYTHON_TEST >$CLIENT_LOG 2>&1
 if [ $? -ne 0 ]; then
