Skip to content

Commit 4593165

Browse files
authored
⬆️ Update to allow vllm 0.8.5 (#69)
* ⬆️ Bump upper bound for 0.8.5 vllm

  Signed-off-by: Evaline Ju <69598118+evaline-ju@users.noreply.github.com>

* ♻️ Backwards compatible import for nullable_str

  Signed-off-by: Evaline Ju <69598118+evaline-ju@users.noreply.github.com>

* ♻️ Account for init_app_state with vllm config

  Signed-off-by: Evaline Ju <69598118+evaline-ju@users.noreply.github.com>

* ♻️ Keep ordering

  Signed-off-by: Evaline Ju <69598118+evaline-ju@users.noreply.github.com>

* 👽 Optional type change

  Signed-off-by: Evaline Ju <69598118+evaline-ju@users.noreply.github.com>

---------

Signed-off-by: Evaline Ju <69598118+evaline-ju@users.noreply.github.com>
1 parent 6a34f9c commit 4593165

File tree

3 files changed

+37
-14
lines changed

3 files changed

+37
-14
lines changed

Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ ARG BASE_UBI_IMAGE_TAG=9.5
88
ARG PYTHON_VERSION=3.12
99

1010
### Build layer
11-
FROM quay.io/vllm/vllm:0.8.4.20250423 as build
11+
FROM quay.io/vllm/vllm:0.8.5.0_cu128 as build
1212

1313
ARG PYTHON_VERSION
1414
ENV PYTHON_VERSION=${PYTHON_VERSION}

pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,8 @@ vllm-tgis-adapter = [
2222
]
2323
vllm = [
2424
# Note: 0.8.4 has a triton bug on Mac
25-
"vllm @ git+https://github.com/vllm-project/vllm.git@v0.8.3 ; sys_platform == 'darwin'",
26-
"vllm>=0.7.2,<0.8.5 ; sys_platform != 'darwin'",
25+
"vllm @ git+https://github.com/vllm-project/vllm.git@v0.8.5 ; sys_platform == 'darwin'",
26+
"vllm>=0.7.2,<0.8.6 ; sys_platform != 'darwin'",
2727
]
2828

2929
## Dev Extra Sets ##

vllm_detector_adapter/api_server.py

Lines changed: 34 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88
from fastapi.responses import JSONResponse
99
from starlette.datastructures import State
1010
from vllm.config import ModelConfig
11-
from vllm.engine.arg_utils import nullable_str
1211
from vllm.engine.protocol import EngineClient
1312
from vllm.entrypoints.chat_utils import load_chat_template
1413
from vllm.entrypoints.launcher import serve_http
@@ -59,7 +58,7 @@ def chat_detection(
5958

6059
async def init_app_state_with_detectors(
6160
engine_client: EngineClient,
62-
model_config: ModelConfig,
61+
config, # ModelConfig | VllmConfig
6362
state: State,
6463
args: Namespace,
6564
) -> None:
@@ -79,6 +78,11 @@ async def init_app_state_with_detectors(
7978
]
8079

8180
resolved_chat_template = load_chat_template(args.chat_template)
81+
82+
model_config = config
83+
if type(config) != ModelConfig: # VllmConfig
84+
model_config = config.model_config
85+
8286
state.openai_serving_models = OpenAIServingModels(
8387
engine_client=engine_client,
8488
model_config=model_config,
@@ -90,9 +94,7 @@ async def init_app_state_with_detectors(
9094
# Use vllm app state init
9195
# init_app_state became async in https://github.com/vllm-project/vllm/pull/11727
9296
# ref. https://github.com/opendatahub-io/vllm-tgis-adapter/pull/207
93-
maybe_coroutine = api_server.init_app_state(
94-
engine_client, model_config, state, args
95-
)
97+
maybe_coroutine = api_server.init_app_state(engine_client, config, state, args)
9698
if inspect.isawaitable(maybe_coroutine):
9799
await maybe_coroutine
98100

@@ -161,10 +163,18 @@ def signal_handler(*_) -> None:
161163
# Use vllm build_app which adds middleware
162164
app = api_server.build_app(args)
163165

164-
model_config = await engine_client.get_model_config()
165-
await init_app_state_with_detectors(
166-
engine_client, model_config, app.state, args
167-
)
166+
# api_server.init_app_state takes vllm_config
167+
# ref. https://github.com/vllm-project/vllm/pull/16572
168+
if hasattr(engine_client, "get_vllm_config"):
169+
vllm_config = await engine_client.get_vllm_config()
170+
await init_app_state_with_detectors(
171+
engine_client, vllm_config, app.state, args
172+
)
173+
else:
174+
model_config = await engine_client.get_model_config()
175+
await init_app_state_with_detectors(
176+
engine_client, model_config, app.state, args
177+
)
168178

169179
def _listen_addr(a: str) -> str:
170180
if is_valid_ipv6_address(a):
@@ -280,17 +290,30 @@ async def create_generation_detection(
280290

281291

282292
def add_chat_detection_params(parser):
293+
294+
template_type = None
295+
try:
296+
# Third Party
297+
from vllm.engine.arg_utils import nullable_str
298+
299+
template_type = nullable_str
300+
except ImportError:
301+
# Third Party
302+
from vllm.engine.arg_utils import optional_type
303+
304+
template_type = optional_type(str)
305+
283306
parser.add_argument(
284307
"--task-template",
285-
type=nullable_str,
308+
type=template_type,
286309
default=None,
287310
help="The file path to the task template, "
288311
"or the template in single-line form "
289312
"for the specified model",
290313
)
291314
parser.add_argument(
292315
"--output-template",
293-
type=nullable_str,
316+
type=template_type,
294317
default=None,
295318
help="The file path to the output template, "
296319
"or the template in single-line form "

0 commit comments

Comments (0)