Commit b6bd649

Update docs and messages

1 parent 8376651 commit b6bd649

File tree: 3 files changed (+4 −7 lines changed)


ci/L0_check_health_vllm/check_health_test.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -117,7 +117,7 @@ def test_vllm_not_healthy(self):
         # The 2nd infer should begin with health check failed
         self._llm_infer()
         self._assert_infer_exception(
-            "vLLM engine is not healthy and model will be unloaded"
+            "Model is unavailable due to unhealthy vLLM engine"
         )
         self._assert_model_ready(False)
         # The 3rd infer should have model not found
```
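The hunk changes only the expected message; the helper methods live elsewhere in the test file. As a rough, self-contained sketch of the assertion pattern (all names below are hypothetical stand-ins: `InferenceServerException` mimics tritonclient's exception type, and `llm_infer_stub` simulates the server's reply once the vLLM engine health check has failed):

```python
# Hypothetical stand-ins, not the real tritonclient or test-suite API.

class InferenceServerException(Exception):
    """Stand-in for tritonclient.utils.InferenceServerException."""


def llm_infer_stub():
    # Simulates an inference attempt against an unhealthy engine.
    raise InferenceServerException(
        "Model is unavailable due to unhealthy vLLM engine"
    )


def assert_infer_exception(expected_message, infer_fn):
    # Run the inference callable and require the expected error message.
    try:
        infer_fn()
    except InferenceServerException as e:
        assert expected_message in str(e), f"unexpected error: {e}"
    else:
        raise AssertionError("expected an InferenceServerException")


assert_infer_exception(
    "Model is unavailable due to unhealthy vLLM engine", llm_infer_stub
)
```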

docs/health_check.md

Lines changed: 2 additions & 5 deletions
```diff
@@ -35,9 +35,8 @@
 
 The vLLM backend supports checking for
 [vLLM Engine Health](https://github.com/vllm-project/vllm/blob/v0.6.3.post1/vllm/engine/async_llm_engine.py#L1177-L1185)
-upon receiving each inference request. If the health check fails, the entire
-model will be unloaded, so its state becomes NOT Ready at the server, which can
-be queried by the
+upon receiving each inference request. If the health check fails, the model
+state will become NOT Ready at the server, which can be queried by the
 [Repository Index](https://github.com/triton-inference-server/server/blob/main/docs/protocol/extension_model_repository.md#index)
 or
 [Model Ready](https://github.com/triton-inference-server/client/blob/main/src/c%2B%2B/library/http_client.h#L178-L192)
@@ -54,5 +53,3 @@ parameters: {
 and select
 [Model Control Mode EXPLICIT](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_management.md#model-control-mode-explicit)
 when the server is started.
-
-Supported since r24.12.
```
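The doc points to the Repository Index for querying model state. As an illustrative sketch of interpreting an index response (the payload shape follows Triton's model-repository extension, a JSON list of name/version/state/reason entries; the model names below are hypothetical):

```python
import json

def not_ready_models(index_json):
    """Return names of models whose state is anything other than READY."""
    return [e["name"] for e in json.loads(index_json) if e.get("state") != "READY"]


# Example payload resembling an index response after the health check failed;
# "vllm_model" and "other_model" are made-up names for illustration.
sample = json.dumps([
    {"name": "vllm_model", "version": "1", "state": "UNAVAILABLE",
     "reason": "Model is unavailable due to unhealthy vLLM engine"},
    {"name": "other_model", "version": "1", "state": "READY", "reason": ""},
])

print(not_ready_models(sample))  # prints ['vllm_model']
```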

src/model.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -701,7 +701,7 @@ def _check_health(self, requests):
             request.get_response_sender().send(
                 pb_utils.InferenceResponse(
                     error=pb_utils.TritonError(
-                        message="vLLM engine is not healthy and model will be unloaded",
+                        message="Model is unavailable due to unhealthy vLLM engine",
                         code=pb_utils.TritonError.UNAVAILABLE,
                     )
                 ),
```
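A hedged sketch of the control flow around the changed line: when the engine's health check raises, every pending request is answered with an UNAVAILABLE error carrying the new message. All classes below are simplified stand-ins, not the real vLLM engine or `pb_utils` API:

```python
# Simplified stand-ins for illustration only.

class StubEngine:
    def __init__(self, healthy):
        self.healthy = healthy

    def check_health(self):
        if not self.healthy:
            raise RuntimeError("engine loop has died")


class StubRequest:
    def __init__(self):
        self.sent = []

    def get_response_sender(self):
        return self

    def send(self, response):
        self.sent.append(response)


def check_health(engine, requests):
    """Mirrors the diff: reject all requests if the health check fails."""
    try:
        engine.check_health()
    except Exception:
        for request in requests:
            request.get_response_sender().send({
                "error": "Model is unavailable due to unhealthy vLLM engine",
                "code": "UNAVAILABLE",
            })
        return False  # caller would then mark the model not ready
    return True


requests = [StubRequest(), StubRequest()]
print(check_health(StubEngine(healthy=False), requests))  # prints False
print(requests[0].sent[0]["code"])  # prints UNAVAILABLE
```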
