|
11 | 11 | from fastapi import BackgroundTasks, FastAPI, HTTPException, Request |
12 | 12 | from fastapi.responses import Response, StreamingResponse |
13 | 13 | from vllm.engine.arg_utils import AsyncEngineArgs |
14 | | -from vllm.engine.async_llm_engine import AsyncLLMEngine |
| 14 | +from vllm.engine.async_llm_engine import AsyncEngineDeadError, AsyncLLMEngine |
15 | 15 | from vllm.entrypoints.openai.protocol import CompletionRequest as OpenAICompletionRequest |
16 | 16 | from vllm.model_executor.guided_decoding import get_guided_decoding_logits_processor |
17 | 17 | from vllm.outputs import CompletionOutput |
@@ -43,97 +43,101 @@ async def generate(request: Request) -> Response: |
43 | 43 | # check health before accepting request and fail fast if engine isn't healthy |
44 | 44 | try: |
45 | 45 | await engine.check_health() |
46 | | - except Exception as e: |
47 | | - print(f"The vllm engine is dead, exiting the pod: {e}") |
48 | | - os.kill(os.getpid(), signal.SIGINT) |
49 | 46 |
50 | | - request_dict = await request.json() |
51 | | - prompt = request_dict.pop("prompt") |
52 | | - stream = request_dict.pop("stream", False) |
53 | | - guided_json = request_dict.pop("guided_json", None) |
54 | | - guided_regex = request_dict.pop("guided_regex", None) |
55 | | - guided_choice = request_dict.pop("guided_choice", None) |
56 | | - guided_grammar = request_dict.pop("guided_grammar", None) |
57 | | - sampling_params = SamplingParams(**request_dict) |
| 47 | + request_dict = await request.json() |
| 48 | + prompt = request_dict.pop("prompt") |
| 49 | + stream = request_dict.pop("stream", False) |
| 50 | + guided_json = request_dict.pop("guided_json", None) |
| 51 | + guided_regex = request_dict.pop("guided_regex", None) |
| 52 | + guided_choice = request_dict.pop("guided_choice", None) |
| 53 | + guided_grammar = request_dict.pop("guided_grammar", None) |
| 54 | + sampling_params = SamplingParams(**request_dict) |
| 55 | + |
| 56 | + # Dummy request to get guided decode logit processor |
| 57 | + try: |
| 58 | + partial_openai_request = OpenAICompletionRequest.model_validate( |
| 59 | + { |
| 60 | + "model": "", |
| 61 | + "prompt": "", |
| 62 | + "guided_json": guided_json, |
| 63 | + "guided_regex": guided_regex, |
| 64 | + "guided_choice": guided_choice, |
| 65 | + "guided_grammar": guided_grammar, |
| 66 | + } |
| 67 | + ) |
| 68 | + except Exception: |
| 69 | + raise HTTPException( |
| 70 | + status_code=400, detail="Bad request: failed to parse guided decoding parameters." |
| 71 | + ) |
58 | 72 |
59 | | - # Dummy request to get guided decode logit processor |
60 | | - try: |
61 | | - partial_openai_request = OpenAICompletionRequest.model_validate( |
62 | | - { |
63 | | - "model": "", |
64 | | - "prompt": "", |
65 | | - "guided_json": guided_json, |
66 | | - "guided_regex": guided_regex, |
67 | | - "guided_choice": guided_choice, |
68 | | - "guided_grammar": guided_grammar, |
69 | | - } |
70 | | - ) |
71 | | - except Exception: |
72 | | - raise HTTPException( |
73 | | - status_code=400, detail="Bad request: failed to parse guided decoding parameters." |
| 73 | + guided_decoding_backend = engine.engine.decoding_config.guided_decoding_backend |
| 74 | + guided_decode_logit_processor = await get_guided_decoding_logits_processor( |
| 75 | + guided_decoding_backend, partial_openai_request, await engine.get_tokenizer() |
74 | 76 | ) |
| 77 | + if guided_decode_logit_processor is not None: |
| 78 | + if sampling_params.logits_processors is None: |
| 79 | + sampling_params.logits_processors = [] |
| 80 | + sampling_params.logits_processors.append(guided_decode_logit_processor) |
75 | 81 |
76 | | - guided_decoding_backend = engine.engine.decoding_config.guided_decoding_backend |
77 | | - guided_decode_logit_processor = await get_guided_decoding_logits_processor( |
78 | | - guided_decoding_backend, partial_openai_request, await engine.get_tokenizer() |
79 | | - ) |
80 | | - if guided_decode_logit_processor is not None: |
81 | | - if sampling_params.logits_processors is None: |
82 | | - sampling_params.logits_processors = [] |
83 | | - sampling_params.logits_processors.append(guided_decode_logit_processor) |
84 | | - |
85 | | - request_id = random_uuid() |
86 | | - |
87 | | - results_generator = engine.generate(prompt, sampling_params, request_id) |
88 | | - |
89 | | - async def abort_request() -> None: |
90 | | - await engine.abort(request_id) |
91 | | - |
92 | | - if stream: |
93 | | - # Streaming case |
94 | | - async def stream_results() -> AsyncGenerator[str, None]: |
95 | | - last_output_text = "" |
96 | | - async for request_output in results_generator: |
97 | | - log_probs = format_logprobs(request_output) |
98 | | - ret = { |
99 | | - "text": request_output.outputs[-1].text[len(last_output_text) :], |
100 | | - "count_prompt_tokens": len(request_output.prompt_token_ids), |
101 | | - "count_output_tokens": len(request_output.outputs[0].token_ids), |
102 | | - "log_probs": log_probs[-1] if log_probs and sampling_params.logprobs else None, |
103 | | - "finished": request_output.finished, |
104 | | - } |
105 | | - last_output_text = request_output.outputs[-1].text |
106 | | - yield f"data:{json.dumps(ret)}\n\n" |
107 | | - |
108 | | - background_tasks = BackgroundTasks() |
109 | | - # Abort the request if the client disconnects. |
110 | | - background_tasks.add_task(abort_request) |
111 | | - |
112 | | - return StreamingResponse(stream_results(), background=background_tasks) |
113 | | - |
114 | | - # Non-streaming case |
115 | | - final_output = None |
116 | | - tokens = [] |
117 | | - last_output_text = "" |
118 | | - async for request_output in results_generator: |
119 | | - tokens.append(request_output.outputs[-1].text[len(last_output_text) :]) |
120 | | - last_output_text = request_output.outputs[-1].text |
121 | | - if await request.is_disconnected(): |
122 | | - # Abort the request if the client disconnects. |
| 82 | + request_id = random_uuid() |
| 83 | + |
| 84 | + results_generator = engine.generate(prompt, sampling_params, request_id) |
| 85 | + |
| 86 | + async def abort_request() -> None: |
123 | 87 | await engine.abort(request_id) |
124 | | - return Response(status_code=499) |
125 | | - final_output = request_output |
126 | 88 |
127 | | - assert final_output is not None |
128 | | - prompt = final_output.prompt |
129 | | - ret = { |
130 | | - "text": final_output.outputs[0].text, |
131 | | - "count_prompt_tokens": len(final_output.prompt_token_ids), |
132 | | - "count_output_tokens": len(final_output.outputs[0].token_ids), |
133 | | - "log_probs": format_logprobs(final_output), |
134 | | - "tokens": tokens, |
135 | | - } |
136 | | - return Response(content=json.dumps(ret)) |
| 89 | + if stream: |
| 90 | + # Streaming case |
| 91 | + async def stream_results() -> AsyncGenerator[str, None]: |
| 92 | + last_output_text = "" |
| 93 | + async for request_output in results_generator: |
| 94 | + log_probs = format_logprobs(request_output) |
| 95 | + ret = { |
| 96 | + "text": request_output.outputs[-1].text[len(last_output_text) :], |
| 97 | + "count_prompt_tokens": len(request_output.prompt_token_ids), |
| 98 | + "count_output_tokens": len(request_output.outputs[0].token_ids), |
| 99 | + "log_probs": log_probs[-1] |
| 100 | + if log_probs and sampling_params.logprobs |
| 101 | + else None, |
| 102 | + "finished": request_output.finished, |
| 103 | + } |
| 104 | + last_output_text = request_output.outputs[-1].text |
| 105 | + yield f"data:{json.dumps(ret)}\n\n" |
| 106 | + |
| 107 | + background_tasks = BackgroundTasks() |
| 108 | + # Abort the request if the client disconnects. |
| 109 | + background_tasks.add_task(abort_request) |
| 110 | + |
| 111 | + return StreamingResponse(stream_results(), background=background_tasks) |
| 112 | + |
| 113 | + # Non-streaming case |
| 114 | + final_output = None |
| 115 | + tokens = [] |
| 116 | + last_output_text = "" |
| 117 | + async for request_output in results_generator: |
| 118 | + tokens.append(request_output.outputs[-1].text[len(last_output_text) :]) |
| 119 | + last_output_text = request_output.outputs[-1].text |
| 120 | + if await request.is_disconnected(): |
| 121 | + # Abort the request if the client disconnects. |
| 122 | + await engine.abort(request_id) |
| 123 | + return Response(status_code=499) |
| 124 | + final_output = request_output |
| 125 | + |
| 126 | + assert final_output is not None |
| 127 | + prompt = final_output.prompt |
| 128 | + ret = { |
| 129 | + "text": final_output.outputs[0].text, |
| 130 | + "count_prompt_tokens": len(final_output.prompt_token_ids), |
| 131 | + "count_output_tokens": len(final_output.outputs[0].token_ids), |
| 132 | + "log_probs": format_logprobs(final_output), |
| 133 | + "tokens": tokens, |
| 134 | + } |
| 135 | + return Response(content=json.dumps(ret)) |
| 136 | + |
| 137 | + except AsyncEngineDeadError as e: |
| 138 | + print(f"The vllm engine is dead, exiting the pod: {e}") |
| 139 | + os.kill(os.getpid(), signal.SIGINT) |
| 140 | + raise e |
137 | 141 |
138 | 142 |
139 | 143 | def get_gpu_free_memory(): |
@@ -206,7 +210,6 @@ def extract_logprobs(logprobs: Dict[int, Logprob]) -> Dict[int, float]: |
206 | 210 |
207 | 211 | engine_args = AsyncEngineArgs.from_cli_args(args) |
208 | 212 | engine = AsyncLLMEngine.from_engine_args(engine_args) |
209 | | - engine.check_health() |
210 | 213 |
211 | 214 | signal.signal(signal.SIGUSR1, debug) |
212 | 215 |
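
For reference, here is a minimal client sketch (not part of the commit) that exercises the handler above in both its non-streaming and streaming modes. The base URL and the /generate route path are illustrative assumptions (the route decorator sits outside this diff), and the payload keys mirror what the handler pops from request_dict before handing the remainder to SamplingParams.

# Hypothetical client for the handler shown in the diff above.
# BASE_URL and the "/generate" path are assumptions; point them at whatever
# address and route the server is actually exposed on.
import json

import requests

BASE_URL = "http://localhost:5005"  # assumed host/port, not defined in this diff


def generate(prompt: str, **sampling_kwargs) -> dict:
    """Non-streaming call: returns the final response dict in one piece."""
    payload = {"prompt": prompt, "stream": False, **sampling_kwargs}
    resp = requests.post(f"{BASE_URL}/generate", json=payload)
    resp.raise_for_status()
    # Keys produced by the handler: text, count_prompt_tokens,
    # count_output_tokens, log_probs, tokens.
    return resp.json()


def generate_stream(prompt: str, **sampling_kwargs):
    """Streaming call: yields one decoded chunk per server-sent event."""
    payload = {"prompt": prompt, "stream": True, **sampling_kwargs}
    with requests.post(f"{BASE_URL}/generate", json=payload, stream=True) as resp:
        resp.raise_for_status()
        for line in resp.iter_lines():
            if line:  # skip the blank separators between "data:{...}" events
                yield json.loads(line.decode("utf-8").removeprefix("data:"))


if __name__ == "__main__":
    print(generate("Hello, world", max_tokens=16, temperature=0.0)["text"])
    for chunk in generate_stream("Hello, world", max_tokens=16):
        print(chunk["text"], end="", flush=True)
        if chunk["finished"]:
            break

Guided decoding goes through the same payload: adding, say, "guided_json" with a JSON schema would be picked up by the request_dict.pop calls above and turned into a logits processor before sampling.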