 import vllm.envs as envs
 from fastapi import APIRouter, BackgroundTasks, Request
 from fastapi.responses import Response, StreamingResponse
-from vllm.engine.async_llm_engine import (
-    AsyncEngineDeadError,
-    build_guided_decoding_logits_processor_async,
-)
+from vllm.engine.async_llm_engine import AsyncEngineDeadError
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.launcher import serve_http
 from vllm.entrypoints.openai.api_server import (
@@ -60,16 +57,7 @@ async def generate(request: Request) -> Response:
     prompt = request_dict.pop("prompt")
     stream = request_dict.pop("stream", False)

-    guided_decoding_backend = (
-        await engine_client.get_decoding_config()
-    ).guided_decoding_backend
-
-    sampling_params = await build_guided_decoding_logits_processor_async(
-        sampling_params=SamplingParams(**request_dict),
-        tokenizer=await engine_client.get_tokenizer(lora_request=None),
-        default_guided_backend=guided_decoding_backend,
-        model_config=await engine_client.get_model_config(),
-    )
+    sampling_params = SamplingParams(**request_dict)

     request_id = random_uuid()

@@ -226,7 +214,7 @@ async def run_server_worker(

     global engine_client

-    async with build_async_engine_client(args, client_config) as engine_client:
+    async with build_async_engine_client(args, client_config=client_config) as engine_client:
         maybe_register_tokenizer_info_endpoint(args)
         app = build_app(args)

@@ -250,6 +238,8 @@ async def run_server_worker(
             ssl_certfile=args.ssl_certfile,
             ssl_ca_certs=args.ssl_ca_certs,
             ssl_cert_reqs=args.ssl_cert_reqs,
+            h11_max_incomplete_event_size=args.h11_max_incomplete_event_size,
+            h11_max_header_count=args.h11_max_header_count,
             **uvicorn_kwargs,
         )

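In the `/generate` hunk above, the handler now builds `SamplingParams` directly from the JSON body: after `"prompt"` and `"stream"` are popped, every remaining key is forwarded as a keyword argument to `SamplingParams(**request_dict)`. A minimal client sketch of that contract, assuming the handler is mounted at `POST /generate` on `localhost:8000` (the host, port, and response shape are assumptions for illustration, not part of this diff):

```python
# Hedged client sketch: keys other than "prompt" and "stream" are passed
# straight through to SamplingParams, so standard sampling options such as
# temperature, top_p, and max_tokens go directly in the JSON body.
import requests

payload = {
    "prompt": "The capital of France is",
    "stream": False,      # popped by the handler before SamplingParams is built
    "temperature": 0.7,   # forwarded to SamplingParams(**request_dict)
    "top_p": 0.95,
    "max_tokens": 64,
}

resp = requests.post("http://localhost:8000/generate", json=payload)  # assumed URL
resp.raise_for_status()
print(resp.text)
```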