Commit 0ff1824

Bump vllm to v0.5.0.post1 (#547)
1 parent 6c7c924 commit 0ff1824

4 files changed: +91 -136 lines changed
Lines changed: 0 additions & 40 deletions
@@ -1,37 +1,3 @@
-#################### BASE BUILD IMAGE ####################
-FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS dev
-RUN apt-get update -y \
-    && apt-get install -y python3-pip git
-# Workaround for https://github.com/openai/triton/issues/2507 and
-# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
-# this won't be needed for future versions of this docker image
-# or future versions of triton.
-RUN ldconfig /usr/local/cuda-12.1/compat/
-WORKDIR /workspace
-
-COPY requirements-build.txt requirements-build.txt
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install -r requirements-build.txt
-#################### BASE BUILD IMAGE ####################
-
-#################### FLASH_ATTENTION Build IMAGE ####################
-FROM dev as flash-attn-builder
-# max jobs used for build
-ARG max_jobs=2
-ENV MAX_JOBS=${max_jobs}
-# flash attention version
-ARG flash_attn_version=v2.4.2
-ENV FLASH_ATTN_VERSION=${flash_attn_version}
-
-WORKDIR /usr/src/flash-attention-v2
-
-# Download the wheel or build it if a pre-compiled release doesn't exist
-RUN pip --verbose wheel flash-attn==${FLASH_ATTN_VERSION} \
-    --no-build-isolation --no-deps --no-cache-dir
-
-#################### FLASH_ATTENTION Build IMAGE ####################
-
-#################### Runtime IMAGE ####################
 FROM nvcr.io/nvidia/pytorch:23.09-py3
 
 RUN apt-get update \
@@ -41,10 +7,6 @@ RUN apt-get update \
     && apt-get autoremove -y \
     && rm -rf /var/lib/apt/lists/*
 
-# Install flash attention (from pre-built wheel)
-RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
-    pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir
-
 RUN pip uninstall torch -y
 COPY requirements.txt /workspace/requirements.txt
 RUN pip install -r requirements.txt
@@ -53,5 +15,3 @@ RUN wget https://github.com/peak/s5cmd/releases/download/v2.2.1/s5cmd_2.2.1_Linu
 RUN tar -xvzf s5cmd_2.2.1_Linux-64bit.tar.gz
 
 COPY vllm_server.py /workspace/vllm_server.py
-
-#################### Runtime IMAGE ####################
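With vllm now pinned to 0.5.0.post1 (see requirements.txt below), the separate flash-attention build stage is dropped and the runtime image relies on whatever attention kernels the vllm wheel pulls in itself. A minimal sanity check one could run inside the built image, assuming the kernels arrive via the vllm-flash-attn dependency (that package name is an assumption, not something stated in this commit):

# Assumed post-build check, not part of this commit: confirm vllm is importable and
# that a bundled flash-attention package is present now that the dedicated
# flash-attn build stage has been removed.
import importlib.util

for mod in ("vllm", "vllm_flash_attn"):  # module name "vllm_flash_attn" is assumed
    found = importlib.util.find_spec(mod) is not None
    print(f"{mod}: {'installed' if found else 'MISSING'}")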

model-engine/model_engine_server/inference/vllm/requirements-build.txt

Lines changed: 0 additions & 8 deletions
This file was deleted.
model-engine/model_engine_server/inference/vllm/requirements.txt

Lines changed: 1 addition & 1 deletion
@@ -1,2 +1,2 @@
-vllm==0.4.2
+vllm==0.5.0.post1
 pydantic>=2.0
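A quick way to confirm an environment picked up the new pin is a small check like the one below; this is an assumed verification step, not part of the commit, but the import mirrors what the updated vllm_server.py expects to find in vllm 0.5.0.post1:

# Assumed verification snippet (not part of this commit): check the installed vllm
# version and the AsyncEngineDeadError import that the updated server relies on.
import vllm
from vllm.engine.async_llm_engine import AsyncEngineDeadError, AsyncLLMEngine  # noqa: F401

print("vllm version:", vllm.__version__)
assert vllm.__version__.startswith("0.5.0"), f"unexpected vllm version: {vllm.__version__}"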

model-engine/model_engine_server/inference/vllm/vllm_server.py

Lines changed: 90 additions & 87 deletions
@@ -11,7 +11,7 @@
 from fastapi import BackgroundTasks, FastAPI, HTTPException, Request
 from fastapi.responses import Response, StreamingResponse
 from vllm.engine.arg_utils import AsyncEngineArgs
-from vllm.engine.async_llm_engine import AsyncLLMEngine
+from vllm.engine.async_llm_engine import AsyncEngineDeadError, AsyncLLMEngine
 from vllm.entrypoints.openai.protocol import CompletionRequest as OpenAICompletionRequest
 from vllm.model_executor.guided_decoding import get_guided_decoding_logits_processor
 from vllm.outputs import CompletionOutput
@@ -43,97 +43,101 @@ async def generate(request: Request) -> Response:
     # check health before accepting request and fail fast if engine isn't healthy
     try:
         await engine.check_health()
-    except Exception as e:
-        print(f"The vllm engine is dead, exiting the pod: {e}")
-        os.kill(os.getpid(), signal.SIGINT)
 
-    request_dict = await request.json()
-    prompt = request_dict.pop("prompt")
-    stream = request_dict.pop("stream", False)
-    guided_json = request_dict.pop("guided_json", None)
-    guided_regex = request_dict.pop("guided_regex", None)
-    guided_choice = request_dict.pop("guided_choice", None)
-    guided_grammar = request_dict.pop("guided_grammar", None)
-    sampling_params = SamplingParams(**request_dict)
+        request_dict = await request.json()
+        prompt = request_dict.pop("prompt")
+        stream = request_dict.pop("stream", False)
+        guided_json = request_dict.pop("guided_json", None)
+        guided_regex = request_dict.pop("guided_regex", None)
+        guided_choice = request_dict.pop("guided_choice", None)
+        guided_grammar = request_dict.pop("guided_grammar", None)
+        sampling_params = SamplingParams(**request_dict)
+
+        # Dummy request to get guided decode logit processor
+        try:
+            partial_openai_request = OpenAICompletionRequest.model_validate(
+                {
+                    "model": "",
+                    "prompt": "",
+                    "guided_json": guided_json,
+                    "guided_regex": guided_regex,
+                    "guided_choice": guided_choice,
+                    "guided_grammar": guided_grammar,
+                }
+            )
+        except Exception:
+            raise HTTPException(
+                status_code=400, detail="Bad request: failed to parse guided decoding parameters."
+            )
 
-    # Dummy request to get guided decode logit processor
-    try:
-        partial_openai_request = OpenAICompletionRequest.model_validate(
-            {
-                "model": "",
-                "prompt": "",
-                "guided_json": guided_json,
-                "guided_regex": guided_regex,
-                "guided_choice": guided_choice,
-                "guided_grammar": guided_grammar,
-            }
-        )
-    except Exception:
-        raise HTTPException(
-            status_code=400, detail="Bad request: failed to parse guided decoding parameters."
+        guided_decoding_backend = engine.engine.decoding_config.guided_decoding_backend
+        guided_decode_logit_processor = await get_guided_decoding_logits_processor(
+            guided_decoding_backend, partial_openai_request, await engine.get_tokenizer()
         )
+        if guided_decode_logit_processor is not None:
+            if sampling_params.logits_processors is None:
+                sampling_params.logits_processors = []
+            sampling_params.logits_processors.append(guided_decode_logit_processor)
 
-    guided_decoding_backend = engine.engine.decoding_config.guided_decoding_backend
-    guided_decode_logit_processor = await get_guided_decoding_logits_processor(
-        guided_decoding_backend, partial_openai_request, await engine.get_tokenizer()
-    )
-    if guided_decode_logit_processor is not None:
-        if sampling_params.logits_processors is None:
-            sampling_params.logits_processors = []
-        sampling_params.logits_processors.append(guided_decode_logit_processor)
-
-    request_id = random_uuid()
-
-    results_generator = engine.generate(prompt, sampling_params, request_id)
-
-    async def abort_request() -> None:
-        await engine.abort(request_id)
-
-    if stream:
-        # Streaming case
-        async def stream_results() -> AsyncGenerator[str, None]:
-            last_output_text = ""
-            async for request_output in results_generator:
-                log_probs = format_logprobs(request_output)
-                ret = {
-                    "text": request_output.outputs[-1].text[len(last_output_text) :],
-                    "count_prompt_tokens": len(request_output.prompt_token_ids),
-                    "count_output_tokens": len(request_output.outputs[0].token_ids),
-                    "log_probs": log_probs[-1] if log_probs and sampling_params.logprobs else None,
-                    "finished": request_output.finished,
-                }
-                last_output_text = request_output.outputs[-1].text
-                yield f"data:{json.dumps(ret)}\n\n"
-
-        background_tasks = BackgroundTasks()
-        # Abort the request if the client disconnects.
-        background_tasks.add_task(abort_request)
-
-        return StreamingResponse(stream_results(), background=background_tasks)
-
-    # Non-streaming case
-    final_output = None
-    tokens = []
-    last_output_text = ""
-    async for request_output in results_generator:
-        tokens.append(request_output.outputs[-1].text[len(last_output_text) :])
-        last_output_text = request_output.outputs[-1].text
-        if await request.is_disconnected():
-            # Abort the request if the client disconnects.
+        request_id = random_uuid()
+
+        results_generator = engine.generate(prompt, sampling_params, request_id)
+
+        async def abort_request() -> None:
             await engine.abort(request_id)
-            return Response(status_code=499)
-        final_output = request_output
 
-    assert final_output is not None
-    prompt = final_output.prompt
-    ret = {
-        "text": final_output.outputs[0].text,
-        "count_prompt_tokens": len(final_output.prompt_token_ids),
-        "count_output_tokens": len(final_output.outputs[0].token_ids),
-        "log_probs": format_logprobs(final_output),
-        "tokens": tokens,
-    }
-    return Response(content=json.dumps(ret))
+        if stream:
+            # Streaming case
+            async def stream_results() -> AsyncGenerator[str, None]:
+                last_output_text = ""
+                async for request_output in results_generator:
+                    log_probs = format_logprobs(request_output)
+                    ret = {
+                        "text": request_output.outputs[-1].text[len(last_output_text) :],
+                        "count_prompt_tokens": len(request_output.prompt_token_ids),
+                        "count_output_tokens": len(request_output.outputs[0].token_ids),
+                        "log_probs": log_probs[-1]
+                        if log_probs and sampling_params.logprobs
+                        else None,
+                        "finished": request_output.finished,
+                    }
+                    last_output_text = request_output.outputs[-1].text
+                    yield f"data:{json.dumps(ret)}\n\n"
+
+            background_tasks = BackgroundTasks()
+            # Abort the request if the client disconnects.
+            background_tasks.add_task(abort_request)
+
+            return StreamingResponse(stream_results(), background=background_tasks)
+
+        # Non-streaming case
+        final_output = None
+        tokens = []
+        last_output_text = ""
+        async for request_output in results_generator:
+            tokens.append(request_output.outputs[-1].text[len(last_output_text) :])
+            last_output_text = request_output.outputs[-1].text
+            if await request.is_disconnected():
+                # Abort the request if the client disconnects.
+                await engine.abort(request_id)
+                return Response(status_code=499)
+            final_output = request_output
+
+        assert final_output is not None
+        prompt = final_output.prompt
+        ret = {
+            "text": final_output.outputs[0].text,
+            "count_prompt_tokens": len(final_output.prompt_token_ids),
+            "count_output_tokens": len(final_output.outputs[0].token_ids),
+            "log_probs": format_logprobs(final_output),
+            "tokens": tokens,
+        }
+        return Response(content=json.dumps(ret))
+
+    except AsyncEngineDeadError as e:
+        print(f"The vllm engine is dead, exiting the pod: {e}")
+        os.kill(os.getpid(), signal.SIGINT)
+        raise e
 
 
 def get_gpu_free_memory():
@@ -206,7 +210,6 @@ def extract_logprobs(logprobs: Dict[int, Logprob]) -> Dict[int, float]:
 
     engine_args = AsyncEngineArgs.from_cli_args(args)
     engine = AsyncLLMEngine.from_engine_args(engine_args)
-    engine.check_health()
 
     signal.signal(signal.SIGUSR1, debug)
 
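Taken together, the vllm_server.py changes move dead-engine handling from a narrow pre-check (plus the un-awaited engine.check_health() call at startup, removed above) into a single try/except around the entire request path, so an AsyncEngineDeadError raised at any point is logged, the pod is signalled to exit, and the error is re-raised. A simplified sketch of that control flow, with request parsing, guided decoding, and streaming elided; names follow the diff, but this is an illustration rather than the full handler:

# Simplified sketch of the restructured generate() handler (illustration only).
import json
import os
import signal

from fastapi.responses import Response
from vllm.engine.async_llm_engine import AsyncEngineDeadError


async def generate_sketch(engine, request) -> Response:
    try:
        # Fail fast if the engine is already unhealthy ...
        await engine.check_health()
        # ... but because the whole request path sits inside this try block,
        # an engine death mid-request is caught below as well.
        request_dict = await request.json()
        ...  # build SamplingParams, call engine.generate(...), collect outputs
        return Response(content=json.dumps({"text": "..."}))
    except AsyncEngineDeadError as e:
        # The engine cannot recover: log, signal the pod to shut down so the
        # orchestrator restarts it, and re-raise so this request fails loudly.
        print(f"The vllm engine is dead, exiting the pod: {e}")
        os.kill(os.getpid(), signal.SIGINT)
        raise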